Pranavkk committed on
Commit
6e2b3ef
·
verified ·
1 Parent(s): 5039886

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. AGENTS.md +56 -0
  2. CLAUDE.md +1 -0
  3. Dockerfile +13 -1
  4. README.md +1 -1
  5. __init__.py +25 -25
  6. agent_smoke.py +80 -0
  7. client.py +143 -140
  8. control/__init__.py +2 -2
  9. control/kubernetes_executor.py +396 -230
  10. control/validation.py +69 -38
  11. curriculum.py +131 -0
  12. deploy-local.ps1 +91 -0
  13. deploy/LOCAL_LAPTOP_FASTAPI_GUIDE.md +74 -0
  14. deploy/aws/ARCHITECTURE.md +361 -0
  15. deploy/aws/FASTAPI_AWS_MODE_GUIDE.md +72 -0
  16. deploy/aws/OPERATIONS.md +465 -0
  17. deploy/aws/README.md +361 -0
  18. deploy/aws/cluster-autoscaler-values.yaml +57 -0
  19. deploy/aws/deploy-all.ps1 +493 -0
  20. deploy/aws/deploy.ps1 +369 -0
  21. deploy/aws/deploy.sh +204 -0
  22. deploy/aws/eksctl-cluster.yaml +58 -0
  23. deploy/aws/generate-kubeconfig.ps1 +131 -0
  24. deploy/aws/generate-kubeconfig.sh +138 -0
  25. deploy/aws/grafana-trust-policy.json +12 -0
  26. deploy/aws/grafana-values.yaml +68 -0
  27. deploy/aws/k8s-workloads.yaml +296 -0
  28. deploy/aws/kubeconfig-antiatropos.yaml +34 -0
  29. deploy/aws/prometheus-agent-values.yaml +95 -0
  30. deploy/aws/teardown-all.ps1 +242 -0
  31. deploy/do/README.md +92 -0
  32. deploy/do/antiatropos-control.service +16 -0
  33. deploy/do/deploy-droplet-one-shot.sh +183 -0
  34. deploy/do/uninstall-legacy-openenv.sh +25 -0
  35. deploy/entrypoint.sh +71 -62
  36. deploy/grafana-datasource-local.yaml +11 -0
  37. deploy/grafana-helm-values.yaml +46 -0
  38. deploy/grafana/grafana.ini +21 -21
  39. deploy/grafana/provisioning/dashboards/dashboard.yaml +12 -12
  40. deploy/grafana/provisioning/dashboards/json/antiatropos-live.json +334 -334
  41. deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json +21 -16
  42. deploy/grafana/provisioning/dashboards/json/antiatropos-workloads.json +436 -0
  43. deploy/grafana/provisioning/datasources/prometheus.yaml +2 -2
  44. deploy/index.html +473 -473
  45. deploy/kind-maxpods-250.yaml +11 -0
  46. deploy/local-laptop.yaml +365 -0
  47. deploy/local/datasource-local.yaml +10 -0
  48. deploy/local/grafana-local-values.yaml +34 -0
  49. deploy/local/prometheus-local-values.yaml +49 -0
  50. deploy/nginx.conf +89 -89
AGENTS.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos: The Physics of Autonomous SRE
2
+
3
+ > **"Infrastructure is not a static set of configurations; it is a dynamic system of energy, flow, and stability."**
4
+
5
+ ## The Vision
6
+ AntiAtropos is a next-generation **Autonomous SRE (Site Reliability Engineering) Control Environment**. While traditional DevOps relies on static thresholds (e.g., "if CPU > 80%"), AntiAtropos treats a microservice cluster as a **Physics Engine**.
7
+
8
+ Our vision is to move from reactive scripts to **Dynamical System Control**. We are building an environment where AI agents don't just "fix things"—they balance the "Potential Energy" of a cluster to maintain equilibrium under extreme pressure.
9
+
10
+ ---
11
+
12
+ ## 1. The Physics Engine Concept
13
+ Traditional observability measures metrics; we measure **Stability**. We have modeled our 5-node cluster using **Fluid Queue Dynamics**, treating request flow like water and nodes like reservoirs.
14
+
15
+ ### The Lyapunov Potential ($V$)
16
+ The "North Star" of our environment is the **Lyapunov Energy Function**:
17
+ $$V(s) = \sum_{i=1}^{N} w_i \cdot Q_i^2$$
18
+ * **$Q_i$ (Queue Depth):** The "Potential Energy" or mass accumulated in a service.
19
+ * **$w_i$ (Weight):** The "Gravity" or business importance (node-0 is the VIP Payment Gateway).
20
+ * **Cascading Failures:** Our physics engine models "Backlog Pressure," where one failing node can trigger a chain reaction across its neighbors.
21
+
22
+ ### Advanced Latency Dynamics (M/M/1)
23
+ We move beyond linear latency models. AntiAtropos implements a **"Hockey-Stick" Latency Curve**. As utilization $\rho$ approaches 100%, latency rises sharply and without bound (in an M/M/1 queue it scales as $1/(1-\rho)$)—modeling the "Point of No Return" that real-world on-call engineers fear.
24
+
25
+ ---
26
+
27
+ ## 2. Training Strategy: The Professional Loop
28
+ To build a hackathon-winning agent, we use a complex training pipeline coordinated between **Google Colab** and **Hugging Face**:
29
+
30
+ ### Progressive Curriculum Learning
31
+ Agents are not trained at random. They follow a **Curriculum** (`curriculum.py`) that graduates them through increasingly difficult stages:
32
+ 1. **Stage 1-3:** Capacity Ramping (Learning to scale).
33
+ 2. **Stage 4-5:** Fault Tolerance (Learning to reroute).
34
+ 3. **Stage 6-8:** Surge Stability (Learning to balance competing pressures).
35
+ 4. **Finals:** Sustained protection under cascading failure conditions.
36
+
37
+ ### Episodic Replay Buffer
38
+ Using `replay.py`, our agents maintain a "Long-term Memory" of **Key Transitions**. Instead of relearning from scratch, the model uses **Few-Shot Demonstrations** to see how previously successful strategies were executed.
39
+
40
+ ---
41
+
42
+ ## 3. Upcoming & Unconfirmed Roadmap
43
+ > [!IMPORTANT]
44
+ > **DISCLAIMER:** The following features are in the research phase and are NOT yet finalized or confirmed. Please consult with the core team before assuming implementation details.
45
+
46
+ * **Multi-Token Attention for SRE:** Investigating the use of frequency-selective transformation to capture "cluster breathiness" (p99 jitter) rather than just global averages.
47
+ * **Graph Neural Network (GNN) Control:** Potential pivot toward modeling the cluster as a dynamic graph to directly manage the "topology of stress."
48
+ * **Cross-Cluster Generalization:** Testing models trained on 5 nodes against 10- and 20-node environments.
49
+
50
+ ---
51
+
52
+ ## Why This Wins
53
+ AntiAtropos doesn't follow runbooks. It understands the **laws of motion** within a cluster. By training agents to minimize "System Energy," we create infrastructure that is inherently self-healing, cost-efficient, and mathematically stable.
54
+
55
+ ---
56
+ *Created for the 2026 AntiAtropos Hackathon.*
CLAUDE.md ADDED
@@ -0,0 +1 @@
 
 
1
+ Refer to AGENTS.md for instructions
Dockerfile CHANGED
@@ -6,7 +6,19 @@ ENV DEBIAN_FRONTEND=noninteractive \
6
  PROMETHEUS_VERSION=3.5.1 \
7
  GRAFANA_VERSION=12.3.1 \
8
  PROMETHEUS_ARCH=linux-amd64 \
9
- GRAFANA_ARCH=linux-amd64
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  RUN apt-get update && apt-get install -y --no-install-recommends \
12
  bash \
 
6
  PROMETHEUS_VERSION=3.5.1 \
7
  GRAFANA_VERSION=12.3.1 \
8
  PROMETHEUS_ARCH=linux-amd64 \
9
+ GRAFANA_ARCH=linux-amd64 \
10
+ ANTIATROPOS_ENV_MODE=live \
11
+ ANTIATROPOS_REWARD_OUTPUT_MODE=normalized \
12
+ ANTIATROPOS_CONTROL_TIMEOUT_S=8.0 \
13
+ ANTIATROPOS_PROM_TIMEOUT_S=5.0 \
14
+ ANTIATROPOS_STRICT_REAL=false \
15
+ ANTIATROPOS_METRIC_AGGREGATION=sum \
16
+ ANTIATROPOS_K8S_NAMESPACE=prod-sre \
17
+ ANTIATROPOS_MIN_REPLICAS=1 \
18
+ ANTIATROPOS_SCALE_STEP=3 \
19
+ ANTIATROPOS_CONTROL_PLANE_URL=http://206.189.136.21:8010 \
20
+ PROMETHEUS_URL=http://206.189.136.21:30090 \
21
+ ANTIATROPOS_WORKLOAD_MAP={"node-0":{"deployment":"payments","namespace":"prod-sre"},"node-1":{"deployment":"checkout","namespace":"prod-sre"},"node-2":{"deployment":"catalog","namespace":"prod-sre"},"node-3":{"deployment":"cart","namespace":"prod-sre"},"node-4":{"deployment":"auth","namespace":"prod-sre"}}
22
 
23
  RUN apt-get update && apt-get install -y --no-install-recommends \
24
  bash \
README.md CHANGED
@@ -274,4 +274,4 @@ For fixed-seed studies, use controlled simulator seeding in evaluation harnesses
274
  | Grader quality | Deterministic, interpretable composite score in `[0, 1]` |
275
  | Environment design | Dense Lyapunov-grounded reward, clean reset/step loop, explicit episode boundaries |
276
  | Code quality | Typed Pydantic models, modular components, OpenEnv manifest, containerized runtime |
277
- | Novelty | Lyapunov reward shaping + live K8s control plane + Prometheus telemetry + observability-first design |
 
274
  | Grader quality | Deterministic, interpretable composite score in `[0, 1]` |
275
  | Environment design | Dense Lyapunov-grounded reward, clean reset/step loop, explicit episode boundaries |
276
  | Code quality | Typed Pydantic models, modular components, OpenEnv manifest, containerized runtime |
277
+ | Novelty | Lyapunov reward shaping + live K8s control plane + Prometheus telemetry + observability-first design |
__init__.py CHANGED
@@ -1,25 +1,25 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """AntiAtropos Environment."""
8
-
9
- from .client import AntiAtroposEnv
10
- from .models import (
11
- SREAction,
12
- ActionType,
13
- ClusterObservation,
14
- NodeObservation,
15
- NodeStatus,
16
- )
17
-
18
- __all__ = [
19
- "AntiAtroposEnv",
20
- "SREAction",
21
- "ActionType",
22
- "ClusterObservation",
23
- "NodeObservation",
24
- "NodeStatus",
25
- ]
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """AntiAtropos Environment."""
8
+
9
+ from .client import AntiAtroposEnv
10
+ from .models import (
11
+ SREAction,
12
+ ActionType,
13
+ ClusterObservation,
14
+ NodeObservation,
15
+ NodeStatus,
16
+ )
17
+
18
+ __all__ = [
19
+ "AntiAtroposEnv",
20
+ "SREAction",
21
+ "ActionType",
22
+ "ClusterObservation",
23
+ "NodeObservation",
24
+ "NodeStatus",
25
+ ]
agent_smoke.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick autonomous agent smoke test against the running AntiAtropos FastAPI server.
4
+
5
+ This does NOT require an LLM API key.
6
+ It uses a simple heuristic policy to validate end-to-end control-plane + telemetry wiring.
7
+ """
8
+
9
+ import asyncio
10
+ import os
11
+ from dataclasses import dataclass
12
+
13
+ try:
14
+ from AntiAtropos.client import AntiAtroposEnv
15
+ from AntiAtropos.models import SREAction, ActionType
16
+ except ImportError:
17
+ from client import AntiAtroposEnv # type: ignore
18
+ from models import SREAction, ActionType # type: ignore
19
+
20
+
21
+ @dataclass
22
+ class Config:
23
+ env_url: str = os.getenv("ENV_URL", "http://localhost:8000")
24
+ task_id: str = os.getenv("ANTIATROPOS_TASK", "task-1")
25
+ mode: str = os.getenv("ANTIATROPOS_MODE", os.getenv("ANTIATROPOS_ENV_MODE", "aws"))
26
+ max_steps: int = int(os.getenv("ANTIATROPOS_SMOKE_STEPS", "20"))
27
+
28
+
29
+ def pick_action(obs) -> SREAction:
30
+ # Pick node with highest queue depth as target
31
+ target = max(obs.nodes, key=lambda n: float(getattr(n, "queue_depth", 0.0)))
32
+
33
+ avg_latency = float(getattr(obs, "average_latency_ms", 0.0))
34
+ backlog = float(getattr(obs, "total_queue_backlog", 0.0))
35
+
36
+ # Heuristic policy:
37
+ # - If stressed, scale up busiest node
38
+ # - If very calm, scale down non-VIP node
39
+ # - Otherwise no-op
40
+ if avg_latency > 0.20 or backlog > 0.45:
41
+ return SREAction(action_type=ActionType.SCALE_UP, target_node_id=target.node_id, parameter=0.6)
42
+
43
+ non_vips = [n for n in obs.nodes if not bool(getattr(n, "is_vip", False))]
44
+ if avg_latency < 0.08 and backlog < 0.15 and non_vips:
45
+ down_target = max(non_vips, key=lambda n: float(getattr(n, "capacity", 0.0)))
46
+ return SREAction(action_type=ActionType.SCALE_DOWN, target_node_id=down_target.node_id, parameter=0.4)
47
+
48
+ return SREAction(action_type=ActionType.NO_OP, target_node_id=target.node_id, parameter=0.0)
49
+
50
+
51
+ async def main() -> None:
52
+ cfg = Config()
53
+ print(f"[agent-smoke] env={cfg.env_url} task={cfg.task_id} mode={cfg.mode} steps={cfg.max_steps}")
54
+
55
+ async with AntiAtroposEnv(cfg.env_url, message_timeout_s=120) as env:
56
+ result = await env.reset(task_id=cfg.task_id, mode=cfg.mode)
57
+ print(f"[reset] step={result.observation.step} latency={result.observation.average_latency_ms:.3f} backlog={result.observation.total_queue_backlog:.3f}")
58
+
59
+ rewards = []
60
+ for i in range(1, cfg.max_steps + 1):
61
+ action = pick_action(result.observation)
62
+ result = await env.step(action)
63
+ rewards.append(float(result.reward or 0.0))
64
+ ack = getattr(result.observation, "action_ack_status", "")
65
+ print(
66
+ f"[step {i:02d}] {action.action_type.value} {action.target_node_id} p={action.parameter:.2f} "
67
+ f"reward={float(result.reward or 0.0):.3f} done={bool(result.done)} ack={ack}"
68
+ )
69
+ if result.done:
70
+ break
71
+
72
+ if rewards:
73
+ avg_reward = sum(rewards) / len(rewards)
74
+ print(f"[done] steps={len(rewards)} avg_reward={avg_reward:.3f} final_latency={result.observation.average_latency_ms:.3f} final_backlog={result.observation.total_queue_backlog:.3f}")
75
+ else:
76
+ print("[done] no steps executed")
77
+
78
+
79
+ if __name__ == "__main__":
80
+ asyncio.run(main())
client.py CHANGED
@@ -1,140 +1,143 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """AntiAtropos Environment Client."""
8
-
9
- from typing import Dict
10
-
11
- from openenv.core import EnvClient
12
- from openenv.core.client_types import StepResult
13
- from openenv.core.env_server.types import State
14
-
15
- from .models import SREAction, ClusterObservation, NodeObservation, NodeStatus
16
-
17
-
18
- class AntiAtroposEnv(
19
- EnvClient[SREAction, ClusterObservation, State]
20
- ):
21
- """
22
- Client for the AntiAtropos Environment.
23
-
24
- This client maintains a persistent WebSocket connection to the environment server,
25
- enabling efficient multi-step interactions with lower latency.
26
- Each client instance has its own dedicated environment session on the server.
27
-
28
- Example:
29
- >>> # Connect to a running server
30
- >>> with AntiAtroposEnv(base_url="http://localhost:8000") as client:
31
- ... result = client.reset()
32
- ... print(result.observation.average_latency_ms)
33
- ...
34
- ... action = SREAction(action_type="SCALE_UP", target_node_id="node-0", parameter=2.0)
35
- ... result = client.step(action)
36
- ... print(result.observation.lyapunov_energy)
37
-
38
- Example with Docker:
39
- >>> # Automatically start container and connect
40
- >>> client = AntiAtroposEnv.from_docker_image("AntiAtropos-env:latest")
41
- >>> try:
42
- ... result = client.reset()
43
- ... result = client.step(SREAction(action_type="NO_OP"))
44
- ... finally:
45
- ... client.close()
46
- """
47
-
48
- def _step_payload(self, action: SREAction) -> Dict:
49
- """
50
- Convert SREAction to JSON payload for step message.
51
-
52
- Args:
53
- action: SREAction instance
54
-
55
- Returns:
56
- Dictionary representation suitable for JSON encoding
57
- """
58
- return {
59
- "action_type": action.action_type.value,
60
- "target_node_id": action.target_node_id,
61
- "parameter": action.parameter,
62
- }
63
-
64
- def _parse_result(self, payload: Dict) -> StepResult[ClusterObservation]:
65
- """
66
- Parse server response into StepResult[ClusterObservation].
67
-
68
- Args:
69
- payload: JSON response data from server
70
-
71
- Returns:
72
- StepResult with ClusterObservation
73
- """
74
- obs_data = payload.get("observation", {})
75
-
76
- # Parse per-node list into NodeObservation objects
77
- raw_nodes = obs_data.get("nodes", [])
78
- node_obs = [
79
- NodeObservation(
80
- node_id=n.get("node_id", ""),
81
- status=NodeStatus(n.get("status", NodeStatus.HEALTHY)),
82
- is_vip=n.get("is_vip", False),
83
- queue_depth=n.get("queue_depth", 0),
84
- latency_ms=n.get("latency_ms", 0.0),
85
- incoming_request_rate=n.get("incoming_request_rate", 0.0),
86
- cpu_utilization=n.get("cpu_utilization", 0.0),
87
- importance_weight=n.get("importance_weight", 1.0),
88
- done=n.get("done", False),
89
- reward=n.get("reward", 0.0),
90
- )
91
- for n in raw_nodes
92
- ]
93
-
94
- observation = ClusterObservation(
95
- cluster_id=obs_data.get("cluster_id", ""),
96
- task_id=obs_data.get("task_id", "task-1"),
97
- mode=obs_data.get("mode", "simulated"),
98
- active_nodes=obs_data.get("active_nodes", 0),
99
- average_latency_ms=obs_data.get("average_latency_ms", 0.0),
100
- error_rate=obs_data.get("error_rate", 0.0),
101
- total_queue_backlog=obs_data.get("total_queue_backlog", 0),
102
- current_cost_per_hour=obs_data.get("current_cost_per_hour", 0.0),
103
- lyapunov_energy=obs_data.get("lyapunov_energy", 0.0),
104
- nodes=node_obs,
105
- step=obs_data.get("step", 0),
106
- max_steps=obs_data.get("max_steps", 100),
107
- sla_violations=obs_data.get("sla_violations", 0),
108
- invalid_action_count=obs_data.get("invalid_action_count", 0),
109
- vip_failure_count=obs_data.get("vip_failure_count", 0),
110
- metric_timestamp=obs_data.get("metric_timestamp", 0.0),
111
- data_freshness_ms=obs_data.get("data_freshness_ms", 0),
112
- action_ack_status=obs_data.get("action_ack_status", "success"),
113
- choke_level=obs_data.get("choke_level", 0.0),
114
- raw_reward=obs_data.get("raw_reward", 0.0),
115
- normalized_reward=obs_data.get("normalized_reward", 0.0),
116
- reward_scale_version=obs_data.get("reward_scale_version", "sigmoid-v1"),
117
- done=payload.get("done", False),
118
- reward=payload.get("reward", 0.0),
119
- )
120
-
121
- return StepResult(
122
- observation=observation,
123
- reward=payload.get("reward", 0.0),
124
- done=payload.get("done", False),
125
- )
126
-
127
- def _parse_state(self, payload: Dict) -> State:
128
- """
129
- Parse server response into State object.
130
-
131
- Args:
132
- payload: JSON response from state request
133
-
134
- Returns:
135
- State object with episode_id and step_count
136
- """
137
- return State(
138
- episode_id=payload.get("episode_id"),
139
- step_count=payload.get("step_count", 0),
140
- )
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """AntiAtropos Environment Client."""
8
+
9
+ from typing import Dict
10
+
11
+ from openenv.core import EnvClient
12
+ from openenv.core.client_types import StepResult
13
+ from openenv.core.env_server.types import State
14
+
15
+ try:
16
+ from .models import SREAction, ClusterObservation, NodeObservation, NodeStatus
17
+ except ImportError:
18
+ from models import SREAction, ClusterObservation, NodeObservation, NodeStatus # type: ignore
19
+
20
+
21
+ class AntiAtroposEnv(
22
+ EnvClient[SREAction, ClusterObservation, State]
23
+ ):
24
+ """
25
+ Client for the AntiAtropos Environment.
26
+
27
+ This client maintains a persistent WebSocket connection to the environment server,
28
+ enabling efficient multi-step interactions with lower latency.
29
+ Each client instance has its own dedicated environment session on the server.
30
+
31
+ Example:
32
+ >>> # Connect to a running server
33
+ >>> with AntiAtroposEnv(base_url="http://localhost:8000") as client:
34
+ ... result = client.reset()
35
+ ... print(result.observation.average_latency_ms)
36
+ ...
37
+ ... action = SREAction(action_type="SCALE_UP", target_node_id="node-0", parameter=2.0)
38
+ ... result = client.step(action)
39
+ ... print(result.observation.lyapunov_energy)
40
+
41
+ Example with Docker:
42
+ >>> # Automatically start container and connect
43
+ >>> client = AntiAtroposEnv.from_docker_image("AntiAtropos-env:latest")
44
+ >>> try:
45
+ ... result = client.reset()
46
+ ... result = client.step(SREAction(action_type="NO_OP"))
47
+ ... finally:
48
+ ... client.close()
49
+ """
50
+
51
+ def _step_payload(self, action: SREAction) -> Dict:
52
+ """
53
+ Convert SREAction to JSON payload for step message.
54
+
55
+ Args:
56
+ action: SREAction instance
57
+
58
+ Returns:
59
+ Dictionary representation suitable for JSON encoding
60
+ """
61
+ return {
62
+ "action_type": action.action_type.value,
63
+ "target_node_id": action.target_node_id,
64
+ "parameter": action.parameter,
65
+ }
66
+
67
+ def _parse_result(self, payload: Dict) -> StepResult[ClusterObservation]:
68
+ """
69
+ Parse server response into StepResult[ClusterObservation].
70
+
71
+ Args:
72
+ payload: JSON response data from server
73
+
74
+ Returns:
75
+ StepResult with ClusterObservation
76
+ """
77
+ obs_data = payload.get("observation", {})
78
+
79
+ # Parse per-node list into NodeObservation objects
80
+ raw_nodes = obs_data.get("nodes", [])
81
+ node_obs = [
82
+ NodeObservation(
83
+ node_id=n.get("node_id", ""),
84
+ status=NodeStatus(n.get("status", NodeStatus.HEALTHY)),
85
+ is_vip=n.get("is_vip", False),
86
+ queue_depth=n.get("queue_depth", 0),
87
+ latency_ms=n.get("latency_ms", 0.0),
88
+ incoming_request_rate=n.get("incoming_request_rate", 0.0),
89
+ cpu_utilization=n.get("cpu_utilization", 0.0),
90
+ importance_weight=n.get("importance_weight", 1.0),
91
+ done=n.get("done", False),
92
+ reward=n.get("reward", 0.0),
93
+ )
94
+ for n in raw_nodes
95
+ ]
96
+
97
+ observation = ClusterObservation(
98
+ cluster_id=obs_data.get("cluster_id", ""),
99
+ task_id=obs_data.get("task_id", "task-1"),
100
+ mode=obs_data.get("mode", "simulated"),
101
+ active_nodes=obs_data.get("active_nodes", 0),
102
+ average_latency_ms=obs_data.get("average_latency_ms", 0.0),
103
+ error_rate=obs_data.get("error_rate", 0.0),
104
+ total_queue_backlog=obs_data.get("total_queue_backlog", 0),
105
+ current_cost_per_hour=obs_data.get("current_cost_per_hour", 0.0),
106
+ lyapunov_energy=obs_data.get("lyapunov_energy", 0.0),
107
+ nodes=node_obs,
108
+ step=obs_data.get("step", 0),
109
+ max_steps=obs_data.get("max_steps", 100),
110
+ sla_violations=obs_data.get("sla_violations", 0),
111
+ invalid_action_count=obs_data.get("invalid_action_count", 0),
112
+ vip_failure_count=obs_data.get("vip_failure_count", 0),
113
+ metric_timestamp=obs_data.get("metric_timestamp", 0.0),
114
+ data_freshness_ms=obs_data.get("data_freshness_ms", 0),
115
+ action_ack_status=obs_data.get("action_ack_status", "success"),
116
+ choke_level=obs_data.get("choke_level", 0.0),
117
+ raw_reward=obs_data.get("raw_reward", 0.0),
118
+ normalized_reward=obs_data.get("normalized_reward", 0.0),
119
+ reward_scale_version=obs_data.get("reward_scale_version", "sigmoid-v1"),
120
+ done=payload.get("done", False),
121
+ reward=payload.get("reward", 0.0),
122
+ )
123
+
124
+ return StepResult(
125
+ observation=observation,
126
+ reward=payload.get("reward", 0.0),
127
+ done=payload.get("done", False),
128
+ )
129
+
130
+ def _parse_state(self, payload: Dict) -> State:
131
+ """
132
+ Parse server response into State object.
133
+
134
+ Args:
135
+ payload: JSON response from state request
136
+
137
+ Returns:
138
+ State object with episode_id and step_count
139
+ """
140
+ return State(
141
+ episode_id=payload.get("episode_id"),
142
+ step_count=payload.get("step_count", 0),
143
+ )
control/__init__.py CHANGED
@@ -1,2 +1,2 @@
1
- from .kubernetes_executor import KubernetesExecutor
2
- from .validation import ActionValidator
 
1
+ from .kubernetes_executor import KubernetesExecutor
2
+ from .validation import ActionValidator
control/kubernetes_executor.py CHANGED
@@ -1,230 +1,396 @@
1
- import os
2
- import json
3
- import time
4
- from uuid import uuid4
5
- from typing import Optional
6
-
7
- class KubernetesExecutor:
8
- """
9
- Executes high-level SRE actions on a Kubernetes cluster.
10
- Provides a safe layer between SREAgent and actual infrastructure.
11
- """
12
- def __init__(self, kubeconfig: Optional[str] = None):
13
- # Use provided path or env var, defaulting to mock if neither is found
14
- self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")
15
- self.is_mock = not self.kubeconfig or self.kubeconfig.lower() == "mock"
16
- self.namespace = os.getenv("ANTIATROPOS_K8S_NAMESPACE", "default")
17
- self.min_replicas = int(os.getenv("ANTIATROPOS_MIN_REPLICAS", "1"))
18
- self.max_replicas = int(os.getenv("ANTIATROPOS_MAX_REPLICAS", "20"))
19
- self.scale_step = int(os.getenv("ANTIATROPOS_SCALE_STEP", "3"))
20
- self._apps_v1_api = None
21
- self._node_workload_map = self._load_node_workload_map()
22
- self._live_supported_actions = {"NO_OP", "SCALE_UP", "SCALE_DOWN"}
23
-
24
- @staticmethod
25
- def _normalize_action_type(action_type) -> str:
26
- if hasattr(action_type, "value"):
27
- return str(action_type.value)
28
- return str(action_type)
29
-
30
- def execute(self, action_type: str, target: str, parameter: float) -> str:
31
- """
32
- Translates SRE actions to Kube requests (ScaleDeployment, PatchIngress, etc.)
33
- """
34
- return self.execute_with_metadata(action_type, target, parameter)["ack_status"]
35
-
36
- def execute_with_metadata(self, action_type: str, target: str, parameter: float) -> dict:
37
- """
38
- Execute action and return acknowledgement plus executor metadata.
39
- """
40
- action_id = str(uuid4())
41
- started = time.perf_counter()
42
- ack_status = ""
43
- error_code = ""
44
-
45
- if self.is_mock:
46
- ack_status = self._mock_execution(action_type, target, parameter)
47
- else:
48
- try:
49
- ack_status = self._real_execution(action_type, target, parameter)
50
- except Exception as e:
51
- ack_status = f"Error: Failed to execute {action_type} on {target}: {str(e)}"
52
- error_code = "EXECUTION_ERROR"
53
-
54
- if ack_status.startswith("Rejected:") and not error_code:
55
- error_code = "REJECTED_ACTION"
56
- elif ack_status.startswith("Error:") and not error_code:
57
- error_code = "EXECUTION_ERROR"
58
-
59
- latency_ms = (time.perf_counter() - started) * 1000.0
60
- return {
61
- "action_id": action_id,
62
- "ack_status": ack_status,
63
- "executor_latency_ms": latency_ms,
64
- "executor_error_code": error_code,
65
- }
66
-
67
- def live_enabled_actions(self) -> set[str]:
68
- """Action types that are actually executable in real live mode."""
69
- if self.is_mock:
70
- return {"NO_OP"}
71
- return set(self._live_supported_actions)
72
-
73
- def live_capability_error(self, action_type: str) -> Optional[str]:
74
- """Returns reason when action is not runnable in live mode, else None."""
75
- action = self._normalize_action_type(action_type)
76
- if action not in self.live_enabled_actions():
77
- if self.is_mock:
78
- return (
79
- f"Live mode rejected {action}: no real Kubernetes executor is configured "
80
- "(set KUBECONFIG and ANTIATROPOS_WORKLOAD_MAP)."
81
- )
82
- return f"Live mode rejected {action}: no executor is enabled for this action."
83
- return None
84
-
85
- def _real_execution(self, action_type: str, target: str, parameter: float) -> str:
86
- """Execute bounded actions on a Kubernetes cluster."""
87
- action = self._normalize_action_type(action_type)
88
-
89
- if action == "NO_OP":
90
- return "Ack: NO_OP - no cluster mutation"
91
-
92
- if action in ("SCALE_UP", "SCALE_DOWN"):
93
- return self._scale_deployment(action, target, parameter)
94
-
95
- return f"Rejected: {action} is not enabled for live Kubernetes execution"
96
-
97
- def _mock_execution(self, action_type: str, target: str, parameter: float) -> str:
98
- """Returns mock acknowledgement for actions."""
99
- # TODO: Add realistic latency simulation for K8s control plane
100
- action = self._normalize_action_type(action_type)
101
- return f"Ack: {action} for {target} with value {parameter} - Status: Applied"
102
-
103
- def _scale_deployment(self, action_type: str, target: str, parameter: float) -> str:
104
- namespace, deployment_name = self._resolve_workload_target(target)
105
- apps_v1 = self._get_apps_v1_api()
106
-
107
- scale_obj = apps_v1.read_namespaced_deployment_scale(
108
- name=deployment_name,
109
- namespace=namespace,
110
- )
111
-
112
- current = int(scale_obj.spec.replicas or self.min_replicas)
113
- delta = max(1, int(float(parameter) * self.scale_step))
114
- if action_type == "SCALE_UP":
115
- desired = min(self.max_replicas, current + delta)
116
- else:
117
- desired = max(self.min_replicas, current - delta)
118
-
119
- if desired == current:
120
- return (
121
- f"Ack: {action_type} for {target} - replicas unchanged at {current} "
122
- f"(bounds {self.min_replicas}-{self.max_replicas})"
123
- )
124
-
125
- apps_v1.patch_namespaced_deployment_scale(
126
- name=deployment_name,
127
- namespace=namespace,
128
- body={"spec": {"replicas": desired}},
129
- )
130
-
131
- return (
132
- f"Ack: {action_type} for {target} - deployment {deployment_name} "
133
- f"in namespace {namespace} scaled {current}->{desired}"
134
- )
135
-
136
- def _get_apps_v1_api(self):
137
- if self._apps_v1_api is not None:
138
- return self._apps_v1_api
139
-
140
- from kubernetes import client, config
141
-
142
- if self.kubeconfig and self.kubeconfig.lower() not in ("mock", ""):
143
- config.load_kube_config(config_file=self.kubeconfig)
144
- else:
145
- config.load_incluster_config()
146
-
147
- self._apps_v1_api = client.AppsV1Api()
148
- return self._apps_v1_api
149
-
150
- def _load_node_workload_map(self) -> dict[str, dict[str, str]]:
151
- """
152
- Load node->workload mapping.
153
-
154
- Preferred format (ANTIATROPOS_WORKLOAD_MAP):
155
- {
156
- "node-0": {"deployment": "payments", "namespace": "prod-sre"},
157
- "node-1": {"deployment": "checkout"}
158
- }
159
-
160
- Legacy fallback (ANTIATROPOS_NODE_DEPLOYMENT_MAP):
161
- {
162
- "node-0": "payments",
163
- "node-1": "checkout"
164
- }
165
- """
166
- raw = os.getenv("ANTIATROPOS_WORKLOAD_MAP", "")
167
- if raw:
168
- parsed = self._parse_json_mapping(raw)
169
- if parsed is not None:
170
- return parsed
171
-
172
- legacy_raw = os.getenv("ANTIATROPOS_NODE_DEPLOYMENT_MAP", "")
173
- if legacy_raw:
174
- legacy = self._parse_legacy_mapping(legacy_raw)
175
- if legacy is not None:
176
- return legacy
177
-
178
- return {}
179
-
180
- def _parse_json_mapping(self, raw: str) -> Optional[dict[str, dict[str, str]]]:
181
- try:
182
- data = json.loads(raw)
183
- except json.JSONDecodeError:
184
- return None
185
-
186
- if not isinstance(data, dict):
187
- return None
188
-
189
- out: dict[str, dict[str, str]] = {}
190
- for node_id, workload in data.items():
191
- if not isinstance(workload, dict):
192
- return None
193
- deployment = workload.get("deployment")
194
- if not deployment:
195
- return None
196
- namespace = workload.get("namespace", self.namespace)
197
- out[str(node_id)] = {
198
- "deployment": str(deployment),
199
- "namespace": str(namespace),
200
- }
201
- return out
202
-
203
- def _parse_legacy_mapping(self, raw: str) -> Optional[dict[str, dict[str, str]]]:
204
- try:
205
- data = json.loads(raw)
206
- except json.JSONDecodeError:
207
- return None
208
-
209
- if not isinstance(data, dict):
210
- return None
211
-
212
- out: dict[str, dict[str, str]] = {}
213
- for node_id, deployment in data.items():
214
- if not deployment:
215
- return None
216
- out[str(node_id)] = {
217
- "deployment": str(deployment),
218
- "namespace": self.namespace,
219
- }
220
- return out
221
-
222
- def _resolve_workload_target(self, target: str) -> tuple[str, str]:
223
- if target not in self._node_workload_map:
224
- raise ValueError(
225
- f"Missing workload mapping for target '{target}'. "
226
- "Set ANTIATROPOS_WORKLOAD_MAP with node->deployment bindings."
227
- )
228
-
229
- workload = self._node_workload_map[target]
230
- return workload["namespace"], workload["deployment"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import logging
5
+ import requests
6
+ from uuid import uuid4
7
+ from typing import Optional
8
+
9
+ logger = logging.getLogger("kubernetes_executor")
10
+
11
class KubernetesExecutor:
    """
    Executes high-level SRE actions on a Kubernetes cluster.
    Provides a safe layer between SREAgent and actual infrastructure.

    The execution mode is fixed at construction time:
    - remote: ANTIATROPOS_CONTROL_PLANE_URL is set, so every action is POSTed
      to ``<url>/step``.
    - direct: a kubeconfig path is available, so deployments are scaled through
      the Kubernetes Python client.
    - mock: neither of the above (or the kubeconfig is the literal "mock");
      actions are acknowledged without touching any cluster.
    """

    def __init__(self, kubeconfig: Optional[str] = None):
        # Use provided path or env var, defaulting to mock if neither is found
        self.kubeconfig = kubeconfig or os.getenv("KUBECONFIG")

        # Remote control-plane settings (all read from the environment).
        self.remote_control_url = os.getenv("ANTIATROPOS_CONTROL_PLANE_URL", "").strip().rstrip("/")
        self.remote_timeout_s = float(os.getenv("ANTIATROPOS_CONTROL_TIMEOUT_S", "5.0"))
        self.remote_retry_count = int(os.getenv("ANTIATROPOS_CONTROL_RETRY_COUNT", "2"))
        self.remote_retry_backoff_s = float(os.getenv("ANTIATROPOS_CONTROL_RETRY_BACKOFF_S", "0.25"))

        # Mock mode only when there is neither a remote URL nor a usable kubeconfig.
        self.is_mock = (
            not self.remote_control_url
            and (not self.kubeconfig or self.kubeconfig.lower() == "mock")
        )

        # Scaling bounds; max_replicas is None when scale-up is unbounded.
        self.namespace = os.getenv("ANTIATROPOS_K8S_NAMESPACE", "default")
        self.min_replicas = int(os.getenv("ANTIATROPOS_MIN_REPLICAS", "1"))
        self.max_replicas = self._parse_max_replicas(os.getenv("ANTIATROPOS_MAX_REPLICAS"))
        self.scale_step = int(os.getenv("ANTIATROPOS_SCALE_STEP", "3"))

        self._apps_v1_api = None  # created lazily by _get_apps_v1_api()
        self._node_workload_map = self._load_node_workload_map()
        self._live_supported_actions = {"NO_OP", "SCALE_UP", "SCALE_DOWN"}

        # Retry policy for direct Kubernetes API patches.
        self.k8s_retry_count = int(os.getenv("ANTIATROPOS_K8S_RETRY_COUNT", "2"))
        self.k8s_retry_backoff_s = float(os.getenv("ANTIATROPOS_K8S_RETRY_BACKOFF_S", "0.2"))

    @staticmethod
    def _parse_max_replicas(raw: Optional[str]) -> Optional[int]:
        """
        Parse optional max replicas.

        Returns:
        - int when a positive explicit cap is provided
        - None when scale-up should be unbounded (value missing, empty, one of
          "none"/"unbounded"/"inf"/"infinite", not an integer, or <= 0)
        """
        if raw is None:
            return None
        value = str(raw).strip().lower()
        if value in ("", "none", "unbounded", "inf", "infinite"):
            return None
        try:
            parsed = int(value)
        except ValueError:
            return None
        if parsed <= 0:
            return None
        return parsed

    @staticmethod
    def _normalize_action_type(action_type) -> str:
        """Return the action name as a string, unwrapping enum-like objects via ``.value``."""
        if hasattr(action_type, "value"):
            return str(action_type.value)
        return str(action_type)

    def execute(self, action_type: str, target: str, parameter: float) -> str:
        """
        Translates SRE actions to Kube requests (ScaleDeployment, PatchIngress, etc.)

        Returns only the acknowledgement string of execute_with_metadata().
        """
        return self.execute_with_metadata(action_type, target, parameter)["ack_status"]

    def execute_with_metadata(self, action_type: str, target: str, parameter: float) -> dict:
        """
        Execute action and return acknowledgement plus executor metadata.

        Never raises for execution failures: any exception from the real
        executor is logged and folded into ``ack_status`` as an "Error:" string.

        Returns:
            dict with keys ``action_id`` (fresh UUID string), ``ack_status``,
            ``executor_latency_ms`` and ``executor_error_code`` ("" on success,
            "REJECTED_ACTION" or "EXECUTION_ERROR" otherwise).
        """
        action_id = str(uuid4())
        started = time.perf_counter()
        ack_status = ""
        error_code = ""

        if self.is_mock:
            ack_status = self._mock_execution(action_type, target, parameter)
        else:
            try:
                ack_status = self._real_execution(action_type, target, parameter)
            except Exception as e:
                logger.error(f"Execution failed for {action_type} on {target}: {str(e)}")
                ack_status = f"Error: Failed to execute {action_type} on {target}: {str(e)}"
                error_code = "EXECUTION_ERROR"

        # Derive an error code from the ack prefix when none was set above.
        if ack_status.startswith("Rejected:") and not error_code:
            error_code = "REJECTED_ACTION"
        elif ack_status.startswith("Error:") and not error_code:
            error_code = "EXECUTION_ERROR"

        latency_ms = (time.perf_counter() - started) * 1000.0
        return {
            "action_id": action_id,
            "ack_status": ack_status,
            "executor_latency_ms": latency_ms,
            "executor_error_code": error_code,
        }

    def live_enabled_actions(self) -> set[str]:
        """Action types that are actually executable in real live mode."""
        if self.is_mock:
            return {"NO_OP"}
        # Return a copy so callers cannot mutate the executor's own set.
        return set(self._live_supported_actions)

    def live_capability_error(self, action_type: str) -> Optional[str]:
        """Returns reason when action is not runnable in live mode, else None."""
        action = self._normalize_action_type(action_type)
        if action not in self.live_enabled_actions():
            if self.is_mock:
                return (
                    f"Live mode rejected {action}: no real Kubernetes executor is configured "
                    "(set KUBECONFIG and ANTIATROPOS_WORKLOAD_MAP)."
                )
            return f"Live mode rejected {action}: no executor is enabled for this action."
        return None

    def _real_execution(self, action_type: str, target: str, parameter: float) -> str:
        """
        Execute bounded actions on a Kubernetes cluster.

        A configured remote control plane takes precedence over direct
        Kubernetes access; otherwise only NO_OP and scaling are performed and
        every other action is answered with a "Rejected:" acknowledgement.
        """
        action = self._normalize_action_type(action_type)

        if self.remote_control_url:
            return self._remote_execution(action, target, parameter)

        if action == "NO_OP":
            return "Ack: NO_OP - no cluster mutation"

        if action in ("SCALE_UP", "SCALE_DOWN"):
            return self._scale_deployment(action, target, parameter)

        return f"Rejected: {action} is not enabled for live Kubernetes execution"

    def _mock_execution(self, action_type: str, target: str, parameter: float) -> str:
        """Returns mock acknowledgement for actions."""
        # TODO: Add realistic latency simulation for K8s control plane
        action = self._normalize_action_type(action_type)
        return f"Ack: {action} for {target} with value {parameter} - Status: Applied"

    def _compute_desired_replicas(self, action_type: str, current: int, parameter: float) -> int:
        """
        Compute the replica count an action should move a deployment to.

        The step is ``max(1, int(parameter * scale_step))``. The result is
        clamped to the configured bounds but never moves against the action's
        direction: SCALE_UP never returns less than ``current`` (even when the
        deployment is already above ``max_replicas``) and SCALE_DOWN never
        returns more than ``current`` (even when it is below ``min_replicas``).
        """
        delta = max(1, int(float(parameter) * self.scale_step))
        if action_type == "SCALE_UP":
            target = current + delta
            if self.max_replicas is not None:
                target = min(self.max_replicas, target)
            return max(current, target)
        return min(current, max(self.min_replicas, current - delta))

    def _scale_deployment(self, action_type: str, target: str, parameter: float) -> str:
        """
        Scale the deployment mapped to ``target`` up or down and return an ack string.

        Raises:
            ValueError: when ``target`` has no workload mapping.
            Exception: whatever the Kubernetes client raises on read/patch.
        """
        namespace, deployment_name = self._resolve_workload_target(target)
        apps_v1 = self._get_apps_v1_api()

        scale_obj = apps_v1.read_namespaced_deployment_scale(
            name=deployment_name,
            namespace=namespace,
        )

        # Only a missing value falls back to min_replicas; an explicit 0 is a
        # real replica count and must be reported and scaled from as 0.
        observed = scale_obj.spec.replicas
        current = self.min_replicas if observed is None else int(observed)
        desired = self._compute_desired_replicas(action_type, current, parameter)

        if desired == current:
            upper = "unbounded" if self.max_replicas is None else str(self.max_replicas)
            return (
                f"Ack: {action_type} for {target} - replicas unchanged at {current} "
                f"(bounds {self.min_replicas}-{upper})"
            )

        self._patch_deployment_scale_with_retry(
            apps_v1=apps_v1,
            deployment_name=deployment_name,
            namespace=namespace,
            desired=desired,
        )

        return (
            f"Ack: {action_type} for {target} - deployment {deployment_name} "
            f"in namespace {namespace} scaled {current}->{desired}"
        )

    def _patch_deployment_scale_with_retry(self, apps_v1, deployment_name: str, namespace: str, desired: int) -> None:
        """
        Patch deployment replicas with retries for transient API server errors.

        Retries on HTTP 409/429/500/502/503/504 with exponential backoff
        (``k8s_retry_backoff_s * 2**(attempt-1)``); any other ApiException, or
        the last failed attempt, is re-raised.
        """
        # Imported lazily so mock/remote modes do not need the kubernetes package.
        from kubernetes.client.rest import ApiException

        max_attempts = max(1, self.k8s_retry_count + 1)
        for attempt in range(1, max_attempts + 1):
            try:
                apps_v1.patch_namespaced_deployment_scale(
                    name=deployment_name,
                    namespace=namespace,
                    body={"spec": {"replicas": desired}},
                )
                return
            except ApiException as exc:
                retryable = exc.status in (409, 429, 500, 502, 503, 504)
                if (not retryable) or attempt >= max_attempts:
                    raise
                sleep_s = self.k8s_retry_backoff_s * (2 ** (attempt - 1))
                logger.warning(
                    "Retrying deployment scale patch after ApiException status=%s attempt=%s/%s",
                    exc.status,
                    attempt,
                    max_attempts,
                )
                time.sleep(sleep_s)

    def _remote_execution(self, action: str, target: str, parameter: float) -> str:
        """
        Delegate action execution to a remote FastAPI control plane.

        Expected remote endpoint contract:
        - POST /step
        - Request: {action_type, target_node_id, parameter}
        - Success response includes ack_status and starts with "Ack:"

        This contract matches server.local_laptop_control and is the only
        supported remote control-plane format.

        Raises:
            ValueError: when no remote URL is configured.
            RuntimeError: on HTTP status >= 400, on a non-JSON body, on a JSON
                body that is not an object, or when the request itself fails.
        """
        if not self.remote_control_url:
            raise ValueError("ANTIATROPOS_CONTROL_PLANE_URL is not configured")

        endpoint = f"{self.remote_control_url}/step"
        payload = {
            "action_type": action,
            "target_node_id": target,
            "parameter": float(parameter),
        }

        response = self._post_with_retry(endpoint=endpoint, payload=payload)

        if response.status_code >= 400:
            detail = ""
            try:
                body = response.json()
                detail = str(body.get("detail", body))
            except Exception:
                # Body is not a JSON object; fall back to the raw text.
                detail = response.text.strip()
            if response.status_code == 422 and "action" in detail:
                # Schema mismatch: remind the operator of the expected payload shape.
                detail = (
                    f"{detail}. Expected lightweight control-plane contract at "
                    f"{endpoint}: "
                    '{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
                )
            raise RuntimeError(
                f"Remote control-plane rejected action ({response.status_code}): {detail}"
            )

        try:
            data = response.json()
        except Exception as exc:
            raise RuntimeError("Remote control-plane returned non-JSON response") from exc

        if not isinstance(data, dict):
            # Valid JSON but not an object (e.g. a list): report it clearly
            # instead of failing on data.get below.
            raise RuntimeError("Remote control-plane returned non-object JSON response")

        ack = str(data.get("ack_status", "")).strip()
        if not ack:
            # The server accepted the action but sent no ack text; synthesize one.
            action_id = str(data.get("action_id", "")).strip() or "remote"
            return f"Ack: {action} for {target} via remote control-plane ({action_id})"
        return ack

    def _post_with_retry(self, endpoint: str, payload: dict) -> "requests.Response":
        """
        POST helper with retries for transient HTTP/network failures.

        Network errors and HTTP >= 500 responses are retried with exponential
        backoff. A 5xx received on the final attempt is returned to the caller;
        exhausted network errors raise RuntimeError.

        NOTE(review): a 5xx retry re-sends the same action; confirm the control
        plane tolerates duplicate delivery of scale requests.
        """
        max_attempts = max(1, self.remote_retry_count + 1)
        last_exc: Optional[Exception] = None

        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.post(endpoint, json=payload, timeout=self.remote_timeout_s)
            except requests.RequestException as exc:
                last_exc = exc
                if attempt >= max_attempts:
                    break
                sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
                logger.warning(
                    "Retrying remote control-plane POST after network error attempt=%s/%s: %s",
                    attempt,
                    max_attempts,
                    exc,
                )
                time.sleep(sleep_s)
                continue

            if response.status_code >= 500 and attempt < max_attempts:
                sleep_s = self.remote_retry_backoff_s * (2 ** (attempt - 1))
                logger.warning(
                    "Retrying remote control-plane POST after HTTP %s attempt=%s/%s",
                    response.status_code,
                    attempt,
                    max_attempts,
                )
                time.sleep(sleep_s)
                continue

            return response

        if last_exc is not None:
            raise RuntimeError(f"Remote control-plane request failed: {last_exc}") from last_exc
        raise RuntimeError("Remote control-plane request failed after retries")

    def _get_apps_v1_api(self):
        """Return the cached AppsV1Api client, creating it on first use."""
        if self._apps_v1_api is not None:
            return self._apps_v1_api

        # Imported lazily so mock/remote modes do not need the kubernetes package.
        from kubernetes import client, config

        if self.kubeconfig and self.kubeconfig.lower() not in ("mock", ""):
            config.load_kube_config(config_file=self.kubeconfig)
        else:
            config.load_incluster_config()

        self._apps_v1_api = client.AppsV1Api()
        return self._apps_v1_api

    def _load_node_workload_map(self) -> dict[str, dict[str, str]]:
        """
        Load node->workload mapping.

        Preferred format (ANTIATROPOS_WORKLOAD_MAP):
        {
            "node-0": {"deployment": "payments", "namespace": "prod-sre"},
            "node-1": {"deployment": "checkout"}
        }

        Legacy fallback (ANTIATROPOS_NODE_DEPLOYMENT_MAP):
        {
            "node-0": "payments",
            "node-1": "checkout"
        }

        An unparsable preferred value falls through to the legacy variable;
        when neither yields a mapping the result is an empty dict.
        """
        raw = os.getenv("ANTIATROPOS_WORKLOAD_MAP", "")
        if raw:
            parsed = self._parse_json_mapping(raw)
            if parsed is not None:
                return parsed

        legacy_raw = os.getenv("ANTIATROPOS_NODE_DEPLOYMENT_MAP", "")
        if legacy_raw:
            legacy = self._parse_legacy_mapping(legacy_raw)
            if legacy is not None:
                return legacy

        return {}

    def _parse_json_mapping(self, raw: str) -> Optional[dict[str, dict[str, str]]]:
        """
        Parse the preferred mapping format.

        Returns None (rejecting the whole mapping) when the text is not JSON,
        is not an object, or any entry is not an object with a non-empty
        "deployment". A missing "namespace" defaults to self.namespace.
        """
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            return None

        if not isinstance(data, dict):
            return None

        out: dict[str, dict[str, str]] = {}
        for node_id, workload in data.items():
            if not isinstance(workload, dict):
                return None
            deployment = workload.get("deployment")
            if not deployment:
                return None
            namespace = workload.get("namespace", self.namespace)
            out[str(node_id)] = {
                "deployment": str(deployment),
                "namespace": str(namespace),
            }
        return out

    def _parse_legacy_mapping(self, raw: str) -> Optional[dict[str, dict[str, str]]]:
        """
        Parse the legacy ``{node_id: deployment_name}`` format.

        Returns None when the text is not a JSON object or any deployment name
        is empty. Every entry is placed in self.namespace.
        """
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            return None

        if not isinstance(data, dict):
            return None

        out: dict[str, dict[str, str]] = {}
        for node_id, deployment in data.items():
            if not deployment:
                return None
            out[str(node_id)] = {
                "deployment": str(deployment),
                "namespace": self.namespace,
            }
        return out

    def _resolve_workload_target(self, target: str) -> tuple[str, str]:
        """
        Return ``(namespace, deployment)`` for a node id.

        Raises:
            ValueError: when the node id has no entry in the workload map.
        """
        if target not in self._node_workload_map:
            raise ValueError(
                f"Missing workload mapping for target '{target}'. "
                "Set ANTIATROPOS_WORKLOAD_MAP with node->deployment bindings."
            )

        workload = self._node_workload_map[target]
        return workload["namespace"], workload["deployment"]
control/validation.py CHANGED
@@ -1,38 +1,69 @@
1
- from typing import List, Optional
2
-
3
- class ActionValidator:
4
- """
5
- Validates SRE actions to ensure they stay within safety boundaries.
6
- Prevents destructive operations like 100% shedding on critical nodes.
7
- """
8
- def __init__(self, critical_nodes: Optional[List[str]] = None):
9
- self.critical_nodes = critical_nodes or ["node-0", "node-1", "node-2"]
10
-
11
- def validate(self, action_type: str, target: str, parameter: float, valid_targets: Optional[List[str]] = None) -> (bool, str):
12
- """
13
- Returns (is_valid, error_message).
14
- """
15
- if hasattr(action_type, "value"):
16
- action = str(action_type.value)
17
- else:
18
- action = str(action_type)
19
-
20
- if valid_targets is not None and target not in valid_targets:
21
- return False, f"Unknown target node: {target}"
22
-
23
- if action == "SHED_LOAD" and target in self.critical_nodes:
24
- return False, f"Forbidden: Load shedding on critical node {target}."
25
-
26
- if action in ["SCALE_UP", "SCALE_DOWN"]:
27
- if parameter < 0.0:
28
- return False, "Negative scaling parameters are not allowed."
29
- if parameter > 10.0:
30
- return False, "Scaling parameter must be <= 10.0."
31
-
32
- if action in ["REROUTE_TRAFFIC", "SHED_LOAD"] and not (0.0 <= parameter <= 1.0):
33
- return False, f"{action} parameter must be in [0.0, 1.0]."
34
-
35
- if action == "NO_OP" and parameter != 0.0:
36
- return False, "NO_OP requires parameter=0.0."
37
-
38
- return True, "Success"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple
2
+
3
+
4
class ActionValidator:
    """
    Validates SRE actions to ensure they stay within safety boundaries.
    Prevents destructive operations like 100% shedding on critical nodes.

    Implements soft cooldown for scaling actions: instead of hard-rejecting
    a rapid re-scale, the action passes with a penalty signal. The environment
    can use this penalty to reduce the reward, teaching the agent to wait
    without blocking emergency scaling.
    """

    def __init__(self, critical_nodes: Optional[List[str]] = None, cooldown_ticks: int = 3):
        self.critical_nodes = critical_nodes or ["node-0", "node-1", "node-2"]
        self.cooldown_ticks = cooldown_ticks
        # Track last scale action per node: {node_id: (tick, action_type)}
        self._last_scale: dict[str, Tuple[int, str]] = {}
        self._current_tick: int = 0

    def set_tick(self, tick: int) -> None:
        """
        Update the current tick counter for cooldown tracking.

        A tick lower than the current one means the counter was rewound (for
        example a new episode starting at 0). Scale records from before the
        rewind are dropped so they cannot produce penalties in the new run.
        """
        if tick < self._current_tick:
            self._last_scale.clear()
        self._current_tick = tick

    def validate(self, action_type: str, target: str, parameter: float, valid_targets: Optional[List[str]] = None) -> Tuple[bool, str, float]:
        """
        Returns (is_valid, error_message, cooldown_penalty).

        cooldown_penalty is in [0, 1]:
          0.0 = no penalty (action is fine)
          >0  = soft penalty for rapid re-scaling (action still executes)
        Hard violations (critical shed, out-of-range) still reject with penalty=0.

        Side effect: a SCALE_UP/SCALE_DOWN that passes the range checks is
        recorded as the node's most recent scale action at the current tick.
        """
        if hasattr(action_type, "value"):
            action = str(action_type.value)
        else:
            action = str(action_type)

        cooldown_penalty = 0.0

        if valid_targets is not None and target not in valid_targets:
            return False, f"Unknown target node: {target}", 0.0

        if action == "SHED_LOAD" and target in self.critical_nodes:
            return False, f"Forbidden: Load shedding on critical node {target}.", 0.0

        if action in ["SCALE_UP", "SCALE_DOWN"]:
            if parameter < 0.0:
                return False, "Negative scaling parameters are not allowed.", 0.0
            if parameter > 10.0:
                return False, "Scaling parameter must be <= 10.0.", 0.0

            # Soft cooldown: penalize but don't block repeating the same scale
            # action on the same node within cooldown_ticks.
            previous = self._last_scale.get(target)
            if previous is not None:
                last_tick, last_action = previous
                ticks_since = self._current_tick - last_tick
                # A negative gap means the record is stale (tick went backwards);
                # requiring 0 <= gap keeps the penalty inside [0, 1] and also
                # avoids dividing by a non-positive cooldown_ticks.
                if last_action == action and 0 <= ticks_since < self.cooldown_ticks:
                    # Penalty decays linearly: full penalty at 0 ticks, 0 at cooldown_ticks
                    cooldown_penalty = (self.cooldown_ticks - ticks_since) / self.cooldown_ticks
            # Don't reject — just flag the penalty and remember this action.
            self._last_scale[target] = (self._current_tick, action)

        if action in ["REROUTE_TRAFFIC", "SHED_LOAD"] and not (0.0 <= parameter <= 1.0):
            return False, f"{action} parameter must be in [0.0, 1.0].", 0.0

        if action == "NO_OP" and parameter != 0.0:
            return False, "NO_OP requires parameter=0.0.", 0.0

        return True, "Success", cooldown_penalty
curriculum.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AntiAtropos Curriculum Training.
3
+
4
+ Defines progressive difficulty stages that the agent must pass before advancing.
5
+ Failed stages are retried with higher temperature for exploration.
6
+
7
+ Each stage specifies:
8
+ - task: Which task to run
9
+ - max_steps: Episode length (shorter = easier)
10
+ - pass_threshold: Minimum composite score to advance
11
+ - temperature: Suggested LLM temperature for this stage
12
+ - description: Human-readable label
13
+ """
14
+
15
+ from dataclasses import dataclass
16
+ from typing import List, Optional
17
+
18
+
19
@dataclass
class CurriculumStage:
    """One curriculum step: which task to run, for how long, and the score needed to pass."""
    task: str
    max_steps: int
    pass_threshold: float
    temperature: float = 0.0
    description: str = ""
    retries: int = 0      # failed attempts recorded so far
    max_retries: int = 3  # once retries reaches this, should_skip becomes True

    @property
    def retry_temperature(self) -> float:
        """Base temperature on the first attempt; afterwards +0.3 per retry, capped at 1.0."""
        boosted = min(1.0, self.temperature + self.retries * 0.3)
        return boosted if self.retries else self.temperature

    @property
    def should_skip(self) -> bool:
        """True once the retry budget for this stage is used up."""
        return not (self.retries < self.max_retries)
42
+
43
+
44
# Default stage sequence, easiest first. Each row is
# (task, max_steps, pass_threshold, temperature, description).
CURRICULUM: List[CurriculumStage] = [
    CurriculumStage(
        task=task,
        max_steps=steps,
        pass_threshold=threshold,
        temperature=temp,
        description=label,
    )
    for task, steps, threshold, temp, label in (
        ("task-1", 40, 0.40, 0.0, "Short ramp — learn basic scaling"),
        ("task-1", 60, 0.50, 0.0, "Standard ramp — scale proactively"),
        ("task-1", 100, 0.55, 0.0, "Full ramp — cost-aware scaling"),
        ("task-2", 40, 0.35, 0.0, "Short fault — learn reroute/scale on failure"),
        ("task-2", 60, 0.45, 0.3, "Standard fault — fast recovery"),
        ("task-3", 40, 0.35, 0.0, "Short surge — protect VIP during spike"),
        ("task-3", 60, 0.45, 0.3, "Standard surge — sustained VIP protection"),
        # Final combined test
        ("task-1", 100, 0.55, 0.0, "Final: full ramp at low temp"),
        ("task-2", 60, 0.50, 0.0, "Final: fault recovery at low temp"),
        ("task-3", 60, 0.50, 0.0, "Final: surge protection at low temp"),
    )
]
88
+
89
+
90
class CurriculumTracker:
    """
    Tracks progress through the curriculum stages.

    A stage is left either by passing it (score >= pass_threshold) or by
    exhausting its retry budget (stage.should_skip becomes True).
    """

    def __init__(self, stages: Optional[List["CurriculumStage"]] = None):
        if stages:
            # Caller-supplied stages are used as given; their retry counters
            # are updated in place by report_score().
            self._stages = stages
        else:
            # Work on private copies of the default curriculum so retry
            # counters do not leak into the module-level CURRICULUM or into
            # other trackers.
            from dataclasses import replace

            self._stages = [replace(stage) for stage in CURRICULUM]
        self._current_idx: int = 0

    @property
    def current(self) -> "CurriculumStage":
        """
        The stage currently being attempted.

        Raises:
            IndexError: when every stage has already been passed or skipped.
        """
        if self.is_complete:
            raise IndexError("curriculum is complete; there is no current stage")
        return self._stages[self._current_idx]

    @property
    def current_index(self) -> int:
        """Zero-based index of the current stage (equals total_stages when complete)."""
        return self._current_idx

    @property
    def total_stages(self) -> int:
        """Number of stages in this tracker's curriculum."""
        return len(self._stages)

    @property
    def is_complete(self) -> bool:
        """True once the tracker has moved past the last stage."""
        return self._current_idx >= len(self._stages)

    def report_score(self, score: float) -> bool:
        """
        Report a score for the current stage. Returns True if passed.

        On a failing score the stage's retry counter is incremented; when that
        exhausts the retry budget the tracker still advances, but the return
        value stays False.

        Raises:
            IndexError: when called after the curriculum is complete.
        """
        stage = self.current
        if score >= stage.pass_threshold:
            self._current_idx += 1
            return True

        stage.retries += 1
        if stage.should_skip:
            self._current_idx += 1
        return False

    def progress_summary(self) -> str:
        """One-line description of the current stage, or a completion notice when done."""
        if self.is_complete:
            # No current stage exists any more; report completion instead of failing.
            return f"Curriculum complete ({self.total_stages}/{self.total_stages} stages)"
        stage = self.current
        return (
            f"Stage {self._current_idx + 1}/{self.total_stages}: "
            f"{stage.description} "
            f"(task={stage.task}, max_steps={stage.max_steps}, "
            f"threshold={stage.pass_threshold}, retries={stage.retries})"
        )
deploy-local.ps1 ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# AntiAtropos Local Cluster Deploy
# Deploys workloads, Prometheus, and Grafana on the Kind cluster.
# Grafana port-forward starts automatically at the end.
#
# Parameters:
#   -SkipPortForward  Deploy only; do not start the Grafana port-forward.
#   -GrafanaPort      Local port to forward to the Grafana service (default 3000).

param(
    [switch]$SkipPortForward,
    [int]$GrafanaPort = 3000
)

# Waits (up to 120s) for pods matching a label selector to become Ready and
# reports the real outcome instead of assuming success.
function Wait-PodsReady {
    param(
        [string]$Selector,
        [string]$Namespace,
        [string]$Label
    )
    kubectl wait --for=condition=ready pod -l $Selector -n $Namespace --timeout=120s 2>$null
    if ($LASTEXITCODE -eq 0) {
        Write-Host "      $Label ready."
    } else {
        Write-Host "WARNING: $Label not ready after 120s. Inspect with: kubectl get pods -n $Namespace" -ForegroundColor Red
    }
}

Write-Host "=== AntiAtropos Local Deploy ===" -ForegroundColor Cyan
Write-Host ""

# --- 1. Check cluster ---
Write-Host "[1/5] Checking Kind cluster..." -ForegroundColor Yellow
$cluster = kubectl config current-context 2>$null
if ($cluster -notmatch "antiatropos") {
    Write-Host "WARNING: Current context is '$cluster', expected 'kind-antiatropos-local'. Proceed anyway? [Y/n]"
    $r = Read-Host
    if ($r -eq 'n') { exit 1 }
}

# --- 2. Deploy workload pods ---
Write-Host "[2/5] Deploying workload pods..." -ForegroundColor Yellow
# "AlreadyExists" on re-runs is expected, so namespace creation output is discarded.
kubectl create ns prod-sre 2>&1 | Out-Null
kubectl create ns monitoring 2>&1 | Out-Null
kubectl apply -f "$PSScriptRoot\deploy\local-laptop.yaml"
Write-Host "      Waiting for workloads to be ready..."
# Select by label only: kubectl does not accept a label selector together with --all.
Wait-PodsReady -Selector "app" -Namespace "prod-sre" -Label "Workloads"

# --- 3. Deploy Prometheus ---
Write-Host "[3/5] Deploying Prometheus..." -ForegroundColor Yellow
# 'upgrade --install' installs on first run and upgrades afterwards, keyed on
# the exact release name (a substring match on 'helm list' output can hit an
# unrelated release such as kube-prometheus-stack).
helm upgrade --install prometheus prometheus-community/prometheus -n monitoring -f "$PSScriptRoot\deploy\prometheus-helm-values.yaml"
Write-Host "      Waiting for Prometheus server..."
Wait-PodsReady -Selector "app.kubernetes.io/name=prometheus" -Namespace "monitoring" -Label "Prometheus"

# --- 4. Deploy Grafana ---
Write-Host "[4/5] Deploying Grafana..." -ForegroundColor Yellow
# Update dashboard ConfigMap (recreated so removed dashboards disappear too).
# No trailing backslash on the directory: a '\' directly before the closing
# quote can corrupt the argument handed to a native executable.
$dashboardDir = Join-Path $PSScriptRoot 'deploy\grafana\provisioning\dashboards\json'
kubectl delete configmap grafana-dashboards -n monitoring --ignore-not-found
kubectl create configmap grafana-dashboards -n monitoring "--from-file=$dashboardDir"

helm upgrade --install grafana grafana/grafana -n monitoring -f "$PSScriptRoot\deploy\grafana-helm-values.yaml"
Write-Host "      Waiting for Grafana..."
Wait-PodsReady -Selector "app.kubernetes.io/name=grafana" -Namespace "monitoring" -Label "Grafana"

# --- 5. Start Grafana port-forward ---
Write-Host "[5/5] Grafana port-forward..." -ForegroundColor Yellow
if (-not $SkipPortForward) {
    # Stop a stale kubectl port-forward listening on the same port.
    # NOTE: do not name a variable $pid -- $PID is PowerShell's read-only
    # automatic variable holding this session's own process id, so
    # 'Stop-Process -Id $pid' would terminate the running script host.
    $listeners = @(Get-NetTCPConnection -LocalPort $GrafanaPort -State Listen -ErrorAction SilentlyContinue)
    $ownerIds = @($listeners | ForEach-Object { $_.OwningProcess } | Sort-Object -Unique)
    $stoppedAny = $false
    foreach ($ownerId in $ownerIds) {
        $owner = Get-Process -Id $ownerId -ErrorAction SilentlyContinue
        if (-not $owner) { continue }
        if ($owner.ProcessName -eq 'kubectl') {
            Stop-Process -Id $ownerId -Force -ErrorAction SilentlyContinue
            $stoppedAny = $true
        } else {
            # Never force-kill an unrelated program that happens to own the port.
            Write-Host "WARNING: Port $GrafanaPort is in use by '$($owner.ProcessName)' (PID $ownerId); not stopping it. Use -GrafanaPort to pick another port."
        }
    }
    if ($stoppedAny) { Start-Sleep -Seconds 1 }

    Write-Host "      Starting port-forward on localhost:$GrafanaPort..."
    $proc = Start-Process -PassThru -NoNewWindow kubectl -ArgumentList "port-forward","-n","monitoring","svc/grafana","${GrafanaPort}:80"

    # Verify the port-forward is alive; poll briefly because Grafana may need
    # a few seconds before it answers.
    $healthy = $false
    foreach ($attempt in 1..5) {
        Start-Sleep -Seconds 2
        if ($proc.HasExited) { break }
        try {
            $null = Invoke-WebRequest -Uri "http://localhost:$GrafanaPort/api/health" -UseBasicParsing -TimeoutSec 5
            $healthy = $true
            break
        } catch {
            # Not reachable yet; try again on the next iteration.
        }
    }

    if ($healthy) {
        Write-Host ""
        Write-Host "=== Deploy Complete ===" -ForegroundColor Green
        Write-Host "  Grafana:  http://localhost:$GrafanaPort  (admin / antiatropos)"
        Write-Host "  Dashboards: AntiAtropos Overview, AntiAtropos Live Control Plane"
        Write-Host "  Port-forward PID: $($proc.Id)"
        Write-Host ""
        Write-Host "To stop port-forward: Stop-Process -Id $($proc.Id)"
    } else {
        Write-Host "WARNING: Port-forward started but Grafana not reachable yet. Try: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
    }
} else {
    Write-Host ""
    Write-Host "=== Deploy Complete ===" -ForegroundColor Green
    Write-Host "  To access Grafana: kubectl port-forward -n monitoring svc/grafana ${GrafanaPort}:80"
}
deploy/LOCAL_LAPTOP_FASTAPI_GUIDE.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local Laptop Kubernetes Control with FastAPI
2
+
3
+ This guide uses your local manifest [deploy/local-laptop.yaml](deploy/local-laptop.yaml) and a lightweight server [server/local_laptop_control.py](server/local_laptop_control.py).
4
+
5
+ ## 1) Deploy local workloads
6
+
7
+ ```powershell
8
+ kubectl apply -f deploy/local-laptop.yaml
9
+ kubectl get deploy -n prod-sre
10
+ ```
11
+
12
+ Expected deployments:
13
+ - `auth`
14
+ - `cart`
15
+ - `catalog`
16
+ - `checkout`
17
+ - `payments`
18
+
19
+ ## 2) Set required environment variables
20
+
21
+ The controller requires `KUBECONFIG` and `ANTIATROPOS_WORKLOAD_MAP`.
22
+
23
+ ```powershell
24
+ $env:KUBECONFIG = "$HOME/.kube/config"
25
+ $env:ANTIATROPOS_K8S_NAMESPACE = "prod-sre"
26
+ $env:ANTIATROPOS_MIN_REPLICAS = "1"
27
+ $env:ANTIATROPOS_MAX_REPLICAS = "" # empty => unbounded scale-up
28
+ $env:ANTIATROPOS_SCALE_STEP = "3"
29
+ $env:ANTIATROPOS_WORKLOAD_MAP = '{"node-0":{"deployment":"payments","namespace":"prod-sre"},"node-1":{"deployment":"checkout","namespace":"prod-sre"},"node-2":{"deployment":"catalog","namespace":"prod-sre"},"node-3":{"deployment":"cart","namespace":"prod-sre"},"node-4":{"deployment":"auth","namespace":"prod-sre"}}'
30
+ ```
31
+
32
+ If you already have these in [.env](.env), load them first.
33
+
34
+ ## 3) Start lightweight FastAPI server
35
+
36
+ ```powershell
37
+ uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010
38
+ ```
39
+
40
+ ## 4) Validate server health
41
+
42
+ ```powershell
43
+ Invoke-RestMethod http://localhost:8010/health
44
+ ```
45
+
46
+ Check:
47
+ - `is_mock` should be `False`
48
+ - `mapped_targets` should include `node-0`..`node-4`
49
+
50
+ ## 5) Let your agent execute actions
51
+
52
+ The server accepts `POST /step` with:
53
+ - `action_type`: `NO_OP` | `SCALE_UP` | `SCALE_DOWN`
54
+ - `target_node_id`: `node-*`
55
+ - `parameter`: float
56
+
57
+ Example:
58
+
59
+ ```powershell
60
+ Invoke-RestMethod -Method Post -Uri http://localhost:8010/step -ContentType "application/json" -Body '{"action_type":"SCALE_UP","target_node_id":"node-3","parameter":0.6}'
61
+ ```
62
+
63
+ ## 6) Verify Kubernetes effect
64
+
65
+ ```powershell
66
+ kubectl get deploy cart -n prod-sre
67
+ kubectl get deploy -n prod-sre
68
+ ```
69
+
70
+ ## Notes
71
+
72
+ - This controller is intentionally minimal and does not provide simulator rewards.
73
+ - It is suitable for direct action execution tests from your agent.
74
+ - If you need OpenEnv-compatible `/reset` + `/step` + reward loop, use [server/app.py](server/app.py) in `aws` mode.
deploy/aws/ARCHITECTURE.md ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos Architecture Guide
2
+
3
+ A complete explanation of how AntiAtropos works across Hugging Face Spaces and AWS, written for someone who is technically strong but new to Kubernetes.
4
+
5
+ ---
6
+
7
+ ## The Big Picture
8
+
9
+ AntiAtropos trains AI agents to be Site Reliability Engineers (SREs). An SRE agent watches a simulated microservice cluster and decides when to scale services, reroute traffic, or shed load to keep things running smoothly.
10
+
11
+ The system is split across two platforms:
12
+
13
+ ```
14
+ Hugging Face Spaces AWS
15
+ ===================== ======================
16
+ The "brain" The "muscle"
17
+
18
+ AntiAtropos FastAPI server EKS (Kubernetes cluster)
19
+ - Runs the simulator - Runs the actual microservice pods
20
+ - Runs the SRE agent logic - The agent scales these pods
21
+ - Queries Prometheus for metrics - Prometheus Agent scrapes metrics
22
+ - Sends scale commands to K8s - Metrics flow to AMP
23
+ - Grafana (AMG) visualizes it all
24
+ ```
25
+
26
+ Why split? HF Spaces is free/cheap for running the Python server. AWS EKS is where the real infrastructure lives that the agent practices on.
27
+
28
+ ---
29
+
30
+ ## Kubernetes Concepts You Need
31
+
32
+ ### Pod
33
+
34
+ The smallest unit in Kubernetes. A pod is one or more containers that run together. In our case, each pod runs a single nginx container that simulates a microservice (like "payments" or "checkout").
35
+
36
+ Think of it as: one running instance of a service.
37
+
38
+ ### Deployment
39
+
40
+ A Deployment is a recipe that tells Kubernetes "keep N copies of this pod running at all times." If a pod dies, the Deployment automatically replaces it.
41
+
42
+ The key field is `spec.replicas` — this is the number the SRE agent changes when it scales a service up or down.
43
+
44
+ ```
45
+ Deployment: payments
46
+ replicas: 3 <-- the agent changes this number
47
+ |
48
+ +-- Pod: payments-abc123 (running)
49
+ +-- Pod: payments-def456 (running)
50
+ +-- Pod: payments-ghi789 (running)
51
+ ```
52
+
53
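+ **The agent scales replicas, not pods.** It only edits the desired count; Kubernetes then reconciles the running pods to match. Raising `replicas` from 3 to 5 makes Kubernetes create 2 more pods; lowering it from 5 to 2 makes Kubernetes terminate 3.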
54
+
55
+ ### Service
56
+
57
+ A Service gives pods a stable network name. Instead of connecting to `payments-abc123` directly (which changes when the pod is recreated), you connect to `payments` (the Service), which routes to whichever pods are healthy.
58
+
59
+ ### Namespace
60
+
61
+ A namespace is a folder for organizing resources. We use:
62
+ - `prod-sre` — where the 5 microservice Deployments live
63
+ - `monitoring` — where the Prometheus Agent pod lives
64
+ - `kube-system` — where AWS/EKS system pods live
65
+
66
+ ### Node
67
+
68
+ A node is one EC2 virtual machine in the EKS cluster. Our cluster has 2-4 nodes. Each node runs multiple pods. When all nodes are full and the agent scales up, the new pods stay `Pending` until the Cluster Autoscaler adds another node (up to `maxSize: 4` in our config).
69
+
70
+ ```
71
+ EKS Cluster
72
+ Node 1 (t3.medium - 2 vCPU, 4GB RAM)
73
+ Pod: payments-abc123
74
+ Pod: checkout-def456
75
+ Pod: catalog-ghi789
76
+ Pod: prometheus-agent-xyz
77
+ Node 2 (t3.medium - 2 vCPU, 4GB RAM)
78
+ Pod: payments-jkl012 <-- agent scaled payments from 1 to 2
79
+ Pod: cart-mno345
80
+ Pod: auth-pqr678
81
+ ```
82
+
83
+ ### ResourceQuota
84
+
85
+ A hard limit on how many resources a namespace can use. We set one on `prod-sre` that caps total pods at 30. This is a safety net — even if the Python code cap fails, Kubernetes itself will refuse to create more than 30 pods.
86
+
87
+ ---
88
+
89
+ ## How the SRE Agent Works
90
+
91
+ ### The Loop
92
+
93
+ Every "tick" (one step of the simulation), the agent goes through this cycle:
94
+
95
+ ```
96
+ 1. OBSERVE -- Read telemetry (CPU, latency, queue depth) from Prometheus
97
+ 2. DECIDE -- Choose an action (SCALE_UP, SCALE_DOWN, REROUTE_TRAFFIC, SHED_LOAD, NO_OP)
98
+ 3. ACT -- Send the action to KubernetesExecutor
99
+ 4. REWARD -- Compute Lyapunov stability reward (was the cluster more or less stable?)
100
+ 5. REPEAT
101
+ ```
102
+
103
+ ### How Each Action Works
104
+
105
+ | Action | What the Agent Decides | What Happens on EKS |
106
+ |---|---|---|
107
+ | `SCALE_UP` | "node-0 needs more capacity" | `KubernetesExecutor` patches `payments` Deployment: `replicas: 2 -> 5` |
108
+ | `SCALE_DOWN` | "node-3 is over-provisioned" | `KubernetesExecutor` patches `cart` Deployment: `replicas: 4 -> 1` |
109
+ | `REROUTE_TRAFFIC` | "Move traffic away from node-2" | Currently simulation-only (no live K8s ingress patching) |
110
+ | `SHED_LOAD` | "Drop 50% of traffic to node-3" | Currently simulation-only (no live K8s traffic shaping) |
111
+ | `NO_OP` | "Do nothing this tick" | Nothing changes on EKS |
112
+
113
+ ### The SCALE_UP Flow in Detail
114
+
115
+ Here is exactly what happens when the agent decides to scale up `node-0` (the payments service):
116
+
117
+ ```
118
+ HF Spaces AWS EKS
119
+ ---------- --------
120
+
121
+ Agent: "SCALE_UP, node-0, parameter=0.5"
122
+ |
123
+ v
124
+ AntiAtroposEnvironment.step()
125
+ |
126
+ v
127
+ KubernetesExecutor.execute_with_metadata()
128
+ |
129
+ v
130
+ _load_node_workload_map()
131
+ reads: node-0 -> {"deployment": "payments", "namespace": "prod-sre"}
132
+ |
133
+ v
134
+ _scale_deployment("SCALE_UP", "node-0", 0.5)
135
+ |
136
+ +-- 1. Read current replicas: apps_v1.read_namespaced_deployment_scale("payments", "prod-sre")
137
+ | Current replicas = 2
138
+ |
139
+ +-- 2. Calculate delta: max(1, int(0.5 * 3)) = 1
140
+ | Desired = min(6, 2 + 1) = 3 <-- max_replicas cap from env var
141
+ |
142
+ +-- 3. Patch: apps_v1.patch_namespaced_deployment_scale("payments", "prod-sre",
143
+ | body={"spec": {"replicas": 3}})
144
+ |
145
+ v +---------------------------+
146
+ Returns: "Ack: SCALE_UP for node-0 - | K8s creates 1 new pod: |
147
+ deployment payments in namespace | payments-newpod-xyz |
148
+ prod-sre scaled 2->3" +---------------------------+
149
+ ```
150
+
151
+ ### The Telemetry Flow in Detail
152
+
153
+ How the agent reads metrics from the real cluster:
154
+
155
+ ```
156
+ EKS Cluster AMP HF Spaces
157
+ ----------- --- ----------
158
+
159
+ Workload pods AMP Workspace AntiAtropos
160
+ (payments, checkout...) stores all metrics PrometheusClient
161
+ | ^ |
162
+ | /metrics (scraped every 15s) | |
163
+ v | |
164
+ Prometheus Agent | |
165
+ | | |
166
+ | remote-write (SigV4 auth) | |
167
+ +-------------------------------------------> |
168
+ | |
169
+ | HTTPS query |
170
+ +------------------------>
171
+ (PROMETHEUS_URL env var)
172
+ |
173
+ v
174
+ _fetch_real_metrics()
175
+ runs PromQL like:
176
+ sum(rate(http_requests_total[1m])) by (pod)
177
+ returns: TelemetryRecord for each node
178
+ ```
179
+
180
+ ---
181
+
182
+ ## The Three Layers of Scaling Caps
183
+
184
+ This is the most important thing to understand for cost control. There are **three** independent limits:
185
+
186
+ ### Layer 1: Python Code Cap (Soft)
187
+
188
+ **Where:** `ANTIATROPOS_MAX_REPLICAS` env var on HF Spaces, read by `KubernetesExecutor` in `control/kubernetes_executor.py`.
189
+
190
+ **How it works:** The `_scale_deployment()` method calculates `desired = min(self.max_replicas, current + delta)`. If the agent tries to scale above 6, it gets:
191
+
192
+ ```
193
+ Ack: SCALE_UP for node-0 - replicas unchanged at 6 (bounds 1-6)
194
+ ```
195
+
196
+ **Can it be bypassed?** Yes. A bug in the code, or someone running `kubectl scale deployment payments --replicas=50` directly.
197
+
198
+ **Set to:** `6` on HF Spaces.
199
+
200
+ ### Layer 2: Kubernetes ResourceQuota (Hard)
201
+
202
+ **Where:** `k8s-workloads.yaml` — ResourceQuota on the `prod-sre` namespace.
203
+
204
+ **How it works:** Kubernetes itself refuses to schedule pods that would exceed the quota. If the namespace already has 30 pods and something tries to create a 31st:
205
+
206
+ ```
207
+ Error from server (Forbidden): pods "payments-new" is forbidden:
208
+ exceeded quota: prod-sre-quota, requested: pods=1, used: pods=30, limited: pods=30
209
+ ```
210
+
211
+ **Can it be bypassed?** Only by someone with cluster-admin access who deletes or edits the ResourceQuota.
212
+
213
+ **Set to:** 30 pods total, 8 CPU, 8GB RAM.
214
+
215
+ ### Layer 3: EKS Node Group Max Size (Hard)
216
+
217
+ **Where:** `eksctl-cluster.yaml` — `managedNodeGroups[0].maxSize: 4`.
218
+
219
+ **How it works:** The Cluster Autoscaler will never add more than 4 nodes. Even if there are 100 pending pods, it stops at 4 nodes. Pending pods just wait.
220
+
221
+ **Can it be bypassed?** Only by someone editing the node group in the AWS console.
222
+
223
+ **Set to:** 4 nodes (4 x t3.medium = 8 vCPU, 16GB RAM max).
224
+
225
+ ### How the Three Layers Work Together
226
+
227
+ ```
228
+ Agent wants to scale all 5 deployments to 20 replicas each:
229
+
230
+ Layer 1 (Python cap): 6 replicas max per deployment -> agent gets "unchanged at 6"
231
+ 5 x 6 = 30 pods maximum
232
+
233
+ Layer 2 (ResourceQuota): 30 pods max in namespace -> 31st pod is Forbidden
234
+
235
+ Layer 3 (Node group): 4 nodes max -> if 30 pods don't fit on 4 nodes,
236
+ some stay Pending (no cost)
237
+
238
+ Worst case with all caps: 30 pods on 4 nodes = ~$160/month
239
+ Without any caps: 100 pods on 25 nodes = ~$1,800/month
240
+ ```
241
+
242
+ ---
243
+
244
+ ## The Mapping: Simulator Nodes to Real Deployments
245
+
246
+ The simulator has 5 abstract nodes (node-0 through node-4). The `ANTIATROPOS_WORKLOAD_MAP` env var tells the system which K8s Deployment each simulator node maps to:
247
+
248
+ ```
249
+ Simulator Node K8s Deployment Namespace Notes
250
+ ------------- --------------- --------- -----
251
+ node-0 payments prod-sre VIP (4x importance weight)
252
+ node-1 checkout prod-sre Critical (no SHED_LOAD)
253
+ node-2 catalog prod-sre Critical (no SHED_LOAD)
254
+ node-3 cart prod-sre Non-critical (sheddable)
255
+ node-4 auth prod-sre Non-critical (sheddable)
256
+ ```
257
+
258
+ When the simulator says "SCALE_UP node-0 by 0.5", the system:
259
+ 1. Looks up node-0 in the workload map -> `payments` in `prod-sre`
260
+ 2. Calls `patch_namespaced_deployment_scale("payments", "prod-sre", ...)`
261
+ 3. Kubernetes creates/destroys pods to match the new replica count
262
+
263
+ ---
264
+
265
+ ## What Runs Where (Complete List)
266
+
267
+ ### On Hugging Face Spaces
268
+
269
+ | Component | What It Does | Port |
270
+ |---|---|---|
271
+ | FastAPI server (`server/app.py`) | HTTP API for the agent | 7860 (via NGINX) |
272
+ | Simulator (`simulator.py`) | 5-node microservice cluster simulation | Internal |
273
+ | PrometheusClient (`telemetry/prometheus_client.py`) | Queries AMP for real metrics | Outbound HTTPS |
274
+ | KubernetesExecutor (`control/kubernetes_executor.py`) | Sends scale commands to EKS | Outbound HTTPS |
275
+ | Prometheus metrics exporter | Serves `/metrics` for HF's monitoring | 8000 |
276
+ | Grafana + local Prometheus | Local dashboards (from the Dockerfile) | 3000, 9090 |
277
+
278
+ ### On AWS EKS
279
+
280
+ | Component | Namespace | What It Does |
281
+ |---|---|---|
282
+ | payments Deployment | prod-sre | 2 nginx pods (scales with agent) |
283
+ | checkout Deployment | prod-sre | 1 nginx pod (scales with agent) |
284
+ | catalog Deployment | prod-sre | 1 nginx pod (scales with agent) |
285
+ | cart Deployment | prod-sre | 1 nginx pod (scales with agent) |
286
+ | auth Deployment | prod-sre | 1 nginx pod (scales with agent) |
287
+ | Prometheus Agent | monitoring | Scrapes workload pods, remote-writes to AMP |
288
+ | Cluster Autoscaler | kube-system | Adds/removes EC2 nodes based on demand |
289
+
290
+ ### On AWS Managed Services
291
+
292
+ | Service | What It Does |
293
+ |---|---|
294
+ | AMP (Amazon Managed Prometheus) | Stores all metrics. Queried by HF Spaces. |
295
+ | AMG (Amazon Managed Grafana) | Visualizes metrics in dashboards. Accessed via browser. |
296
+
297
+ ---
298
+
299
+ ## The Simulator vs Real Cluster
300
+
301
+ AntiAtropos has three modes controlled by `ANTIATROPOS_ENV_MODE`:
302
+
303
+ ### Simulated Mode (`simulated`)
304
+
305
+ Everything is fake. The simulator generates synthetic metrics (random CPU, latency, etc.). No K8s, no Prometheus. The agent practices in a safe sandbox.
306
+
307
+ This is the default on HF Spaces without AWS configured.
308
+
309
+ ### Hybrid Mode (`hybrid`)
310
+
311
+ The simulator runs, but it pulls real metrics from AMP to calibrate itself. If AMP says `payments` pods have 80% CPU, the simulator adjusts its internal model to match. The agent can read real data but actions only affect the simulator, not real pods.
312
+
313
+ ### Live Mode (`live`)
314
+
315
+ The real deal. The agent reads real metrics from AMP and sends real scale commands to EKS. When it says `SCALE_UP`, actual pods get created on actual EC2 instances that cost actual money.
316
+
317
+ **Set `ANTIATROPOS_ENV_MODE=live` on HF Spaces to enable this.**
318
+
319
+ ---
320
+
321
+ ## Cost Flow
322
+
323
+ Every pod on EKS costs money. Here is how costs flow based on the agent's actions:
324
+
325
+ ```
326
+ Agent action: SCALE_UP node-0
327
+ -> payments Deployment: replicas 2 -> 5
328
+ -> 3 new pods created
329
+ -> If existing nodes are full, Cluster Autoscaler adds a node
330
+ -> New node = another t3.medium EC2 instance = ~$0.04/hr
331
+ -> 3 pods running = 3 x (0.1 CPU + 64MB RAM) from the quota
332
+
333
+ Agent action: SCALE_DOWN node-3
334
+ -> cart Deployment: replicas 4 -> 1
335
+ -> 3 pods terminated
336
+ -> If nodes are now underutilized, Cluster Autoscaler removes a node (after 10 min)
337
+ -> One fewer EC2 instance = saves ~$0.04/hr
338
+ ```
339
+
340
+ The Lyapunov reward function penalizes the agent for both instability AND cost, so a well-trained agent should learn to scale efficiently:
341
+
342
+ ```
343
+ R_t = -(alpha * delta_V + beta * cost + gamma * SLA_violation)
344
+ ^^^^
345
+ beta=0.01 penalizes over-provisioning
346
+ ```
347
+
348
+ ---
349
+
350
+ ## Quick Reference: Key Files
351
+
352
+ | File | Purpose |
353
+ |---|---|
354
+ | `kubernetes_executor.py` | Translates agent actions to K8s API calls |
355
+ | `prometheus_client.py` | Queries AMP for real metrics |
356
+ | `simulator.py` | 5-node fluid-queue simulation |
357
+ | `stability.py` | Lyapunov reward computation |
358
+ | `deploy/aws/k8s-workloads.yaml` | The 5 Deployments + ResourceQuota on EKS |
359
+ | `deploy/aws/eksctl-cluster.yaml` | EKS cluster definition (nodes, caps) |
360
+ | `deploy/aws/prometheus-agent-values.yaml` | Helm config for Prometheus Agent |
361
+ | `deploy/aws/generate-kubeconfig.sh` | Creates kubeconfig for HF Spaces |
deploy/aws/FASTAPI_AWS_MODE_GUIDE.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FastAPI AWS Mode + Local Grafana Guide
2
+
3
+ This setup keeps Kubernetes + AMP in AWS, while Grafana runs on your laptop.
4
+
5
+ ## 1) Environment file
6
+
7
+ Use [../../.env.example](../../.env.example) as template. A starter [../../.env](../../.env) is already created.
8
+
9
+ Important keys:
10
+
11
+ - `ANTIATROPOS_ENV_MODE=aws`
12
+ - `KUBECONFIG=.../deploy/aws/kubeconfig-antiatropos.yaml`
13
+ - `PROMETHEUS_URL=https://aps-workspaces.<region>.amazonaws.com/workspaces/<workspace-id>`
14
+ - `ANTIATROPOS_WORKLOAD_MAP=...`
15
+ - `ANTIATROPOS_GRAFANA_MODE=external`
16
+
17
+ ## 2) Load .env in PowerShell
18
+
19
+ From workspace root:
20
+
21
+ ```powershell
22
+ Get-Content .env | ForEach-Object {
23
+ if ($_ -match '^\s*#' -or $_ -match '^\s*$') { return }
24
+ $name, $value = $_ -split '=', 2
25
+ [System.Environment]::SetEnvironmentVariable($name, $value, 'Process')
26
+ }
27
+ ```
28
+
29
+ ## 3) Start FastAPI server
30
+
31
+ ```powershell
32
+ uvicorn server.app:app --host 0.0.0.0 --port 8000
33
+ ```
34
+
35
+ ## 4) Verify runtime wiring
36
+
37
+ Check runtime endpoint:
38
+
39
+ - [server/app.py](../../server/app.py) exposes `GET /config/runtime`
40
+ - Example URL: `http://localhost:8000/config/runtime`
41
+
42
+ You should see:
43
+
44
+ - `env_mode: "aws"`
45
+ - `prometheus_url_configured: true`
46
+ - `kubeconfig_configured: true`
47
+ - `workload_map_configured: true`
48
+
49
+ ## 5) Reset environment in AWS mode
50
+
51
+ Use reset with `mode="aws"`, or omit mode and rely on `ANTIATROPOS_ENV_MODE=aws`.
52
+
53
+ ## 6) Run Grafana locally (not in EKS)
54
+
55
+ ```powershell
56
+ docker run -d --name antiatropos-grafana -p 3000:3000 grafana/grafana:latest
57
+ ```
58
+
59
+ Open `http://localhost:3000` and add AMP as Prometheus datasource:
60
+
61
+ - URL: `https://aps-workspaces.<region>.amazonaws.com/workspaces/<workspace-id>`
62
+ - Auth: SigV4 enabled
63
+ - Region: your AWS region (for example `ap-south-1`)
64
+
65
+ Import dashboards:
66
+
67
+ - [../grafana/provisioning/dashboards/json/antiatropos-overview.json](../grafana/provisioning/dashboards/json/antiatropos-overview.json)
68
+ - [../grafana/provisioning/dashboards/json/antiatropos-live.json](../grafana/provisioning/dashboards/json/antiatropos-live.json)
69
+
70
+ ## Notes
71
+
72
+ Grafana is observability-only. Agent control runs via FastAPI + Kubernetes executor.
deploy/aws/OPERATIONS.md ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos AWS Operations Guide
2
+
3
+ Everything you need to run the AWS infrastructure for AntiAtropos without blowing up your bill.
4
+
5
+ **Architecture: FastAPI on Hugging Face Spaces, EKS + AMP + AMG on AWS.**
6
+
7
+ ---
8
+
9
+ ## Table of Contents
10
+
11
+ 1. [Replica Strategy & Caps](#1-replica-strategy--caps)
12
+ 2. [Autoscaling Configuration](#2-autoscaling-configuration)
13
+ 3. [Cost Guardrails](#3-cost-guardrails)
14
+ 4. [Step-by-Step Deployment Walkthrough](#4-step-by-step-deployment-walkthrough)
15
+ 5. [Configuring HF Spaces to Connect to AWS](#5-configuring-hf-spaces-to-connect-to-aws)
16
+ 6. [Day-2 Operations](#6-day-2-operations)
17
+ 7. [Teardown & Cost Recovery](#7-teardown--cost-recovery)
18
+
19
+ ---
20
+
21
+ ## 1. Replica Strategy & Caps
22
+
23
+ ### What Runs Where
24
+
25
+ | Component | Where | Scaled By | Cost Impact |
26
+ |---|---|---|---|
27
+ | **AntiAtropos FastAPI server** | HF Spaces | HF auto-scales | $0-5/month (HF billing) |
28
+ | **Workload pods** (payments, checkout, etc.) | EKS | SRE agent via `KubernetesExecutor` | **HIGH** — this is where costs spiral |
29
+ | **Prometheus Agent** | EKS (monitoring ns) | Static (1 pod) | Low |
30
+ | **AMP** | AWS managed | Serverless | Pay per GB ingested |
31
+ | **AMG** | AWS managed | Serverless | Pay per editor |
32
+
33
+ ### Workload Pod Replicas — Where Costs Spiral
34
+
35
+ The SRE agent's `SCALE_UP` action calls `KubernetesExecutor._scale_deployment()`, which patches `replicas` on real K8s Deployments. A bad agent can scale every deployment to the cap.
36
+
37
+ The `ANTIATROPOS_MAX_REPLICAS` env var (set on HF Spaces) is the **global** ceiling applied to all deployments. The default in `kubernetes_executor.py` is 20 — with 5 deployments, that's **100 pods** worst case. **Set it to 6.**
38
+
39
+ **Recommended caps by deployment:**
40
+
41
+ | Deployment | Min | Max Replicas | Reasoning |
42
+ |---|---|---|---|
43
+ | `payments` (node-0, VIP) | 2 | 6 | VIP node — needs redundancy, 6 is plenty for the traffic model |
44
+ | `checkout` (node-1) | 1 | 5 | Can burst but shouldn't stay high |
45
+ | `catalog` (node-2) | 1 | 5 | Same |
46
+ | `cart` (node-3) | 1 | 4 | Non-critical, sheddable |
47
+ | `auth` (node-4) | 1 | 4 | Non-critical, sheddable |
48
+
49
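+ **Total worst case with these per-deployment caps: 24 workload pods.** Note that `ANTIATROPOS_MAX_REPLICAS` is a single global ceiling, so with it set to 6 the enforced worst case is 5 x 6 = 30 pods; the lower per-deployment figures above are targets, not limits enforced by that variable.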
50
+
51
+ At ~0.25 vCPU / 256MB per workload pod (nginx containers), that's ~6 vCPU and ~6GB RAM — more CPU than 2x t3.medium (2 vCPU / 4GB each, so 4 vCPU / 8GB total) can provide, so plan for 3 nodes at minimum and the full 4-node group for headroom.
52
+
53
+ ### How the Cap Works
54
+
55
+ The `KubernetesExecutor._scale_deployment()` method reads `ANTIATROPOS_MAX_REPLICAS` from the environment and refuses to scale above it:
56
+
57
+ ```
58
+ Ack: SCALE_UP for node-0 - replicas unchanged at 6 (bounds 1-6)
59
+ ```
60
+
61
+ This is enforced in `KubernetesExecutor._scale_deployment()` (`control/kubernetes_executor.py`):
62
+ ```python
63
+ desired = min(self.max_replicas, current + delta)
64
+ ```
65
+
66
+ **Set `ANTIATROPOS_MAX_REPLICAS=6` on your HF Space.**
67
+
68
+ ---
69
+
70
+ ## 2. Autoscaling Configuration
71
+
72
+ ### EKS Node Autoscaling
73
+
74
+ The cluster needs to grow nodes when the agent scales workloads. Install the Cluster Autoscaler:
75
+
76
+ ```bash
77
+ helm repo add autoscaler https://kubernetes.github.io/autoscaler
78
+ helm repo update
79
+
80
+ helm install cluster-autoscaler autoscaler/cluster-autoscaler \
81
+ --namespace kube-system \
82
+ -f deploy/aws/cluster-autoscaler-values.yaml
83
+ ```
84
+
85
+ **The node group `maxSize` in `eksctl-cluster.yaml` (4) is your ultimate cost ceiling.**
86
+
87
+ ```
88
+ 4 nodes x $0.0416/hr (t3.medium on-demand) = $0.1664/hr = ~$120/month max
89
+ ```
90
+
91
+ With spot instances, this drops to ~$36/month max.
92
+
93
+ ### What Happens When the Agent Scales Workloads
94
+
95
+ 1. Agent on HF Spaces sends `SCALE_UP` action
96
+ 2. `KubernetesExecutor._scale_deployment()` patches the Deployment's `spec.replicas` via EKS API server
97
+ 3. Kubernetes scheduler tries to place the new pod
98
+ 4. If no node has capacity -> pod is `Pending`
99
+ 5. Cluster Autoscaler sees `Pending` pods -> adds a node (within `maxSize`)
100
+ 6. If `maxSize` is hit -> pod stays `Pending` (agent action succeeded but pod won't schedule)
101
+
102
+ **This is why `maxSize` in the node group is your ultimate cost ceiling.**
103
+
104
+ ---
105
+
106
+ ## 3. Cost Guardrails
107
+
108
+ ### Monthly Cost Caps by Tier
109
+
110
+ | Tier | Max Nodes | Max Workload Pods | Estimated Monthly Cost |
111
+ |---|---|---|---|
112
+ | **Dev/Testing** | 2 | 10 (2/deployment) | ~$80 |
113
+ | **Training** | 3 | 15 (3/deployment) | ~$130 |
114
+ | **Benchmark Suite** | 4 | 24 (~5/deployment) | ~$160 |
115
+ | **Unlimited (danger)** | inf | 100 (20/deployment) | $500+ |
116
+
117
+ ### AWS Budgets — Get Alerts Before You Overspend
118
+
119
+ ```bash
120
+ aws budgets create-budget \
121
+ --account-id $(aws sts get-caller-identity --query Account --output text) \
122
+ --budget '{
123
+ "BudgetName": "AntiAtropos-Monthly",
124
+ "BudgetLimit": {"Amount": "150", "Unit": "USD"},
125
+ "TimeUnit": "MONTHLY",
126
+ "CostFilters": {
127
+ "TagKeyValue": ["user:Project$AntiAtropos"]
128
+ },
129
+ "CostTypes": {
130
+ "IncludeTax": true,
131
+ "IncludeSubscription": true,
132
+ "UseBlended": false
133
+ }
134
+ }'
135
+
136
+ # Alert at 50%
137
+ aws budgets create-notification \
138
+ --account-id $(aws sts get-caller-identity --query Account --output text) \
139
+ --budget-name "AntiAtropos-Monthly" \
140
+ --notification '{"NotificationType":"ACTUAL","ComparisonOperator":"GREATER_THAN","Threshold":50}' \
141
+ --subscribers '[{"SubscriptionType":"EMAIL","Address":"your-email@example.com"}]'
142
+
143
+ # Alert at 80%
144
+ aws budgets create-notification \
145
+ --account-id $(aws sts get-caller-identity --query Account --output text) \
146
+ --budget-name "AntiAtropos-Monthly" \
147
+ --notification '{"NotificationType":"ACTUAL","ComparisonOperator":"GREATER_THAN","Threshold":80}' \
148
+ --subscribers '[{"SubscriptionType":"EMAIL","Address":"your-email@example.com"}]'
149
+ ```
150
+
151
+ ### Cost-Saving Checklist
152
+
153
+ - [ ] Use **spot instances** for node groups (60-70% cheaper, OK for training)
154
+ - [ ] Set `ANTIATROPOS_MAX_REPLICAS=6` on HF Spaces (not 20) to prevent agent runaway
155
+ - [ ] Cap node group `maxSize` at 4 (in `eksctl-cluster.yaml`)
156
+ - [ ] Set AWS Budget alert at $150/month
157
+ - [ ] Scale workloads to zero between runs: `kubectl scale deployment -n prod-sre --replicas=0 --all`
158
+ - [ ] Delete the cluster for multi-day breaks: `eksctl delete cluster --name antiatropos`
159
+ - [ ] AMP free tier covers the first 40M metric samples ingested and 10 GB of metric storage per month
160
+ - [ ] AMG free tier is 1 editor for 30 days — cancel if not needed
161
+
162
+ ---
163
+
164
+ ## 4. Step-by-Step Deployment Walkthrough
165
+
166
+ ### Before You Start
167
+
168
+ You need:
169
+ - AWS account with billing alerts enabled
170
+ - AWS CLI v2 installed and configured (`aws configure`)
171
+ - eksctl, kubectl, helm installed
172
+ - About 20-30 minutes
173
+
174
+ ### Step 1: Create the EKS Cluster (15 min)
175
+
176
+ ```bash
177
+ eksctl create cluster -f deploy/aws/eksctl-cluster.yaml
178
+
179
+ # Verify
180
+ aws eks update-kubeconfig --name antiatropos --region ap-south-1
181
+ kubectl get nodes
182
+ ```
183
+
184
+ ### Step 2: Deploy Sample Workloads (1 min)
185
+
186
+ ```bash
187
+ kubectl apply -f deploy/aws/k8s-workloads.yaml
188
+ kubectl get pods -n prod-sre
189
+ ```
190
+
191
+ ### Step 3: Create AMP Workspace (1 min)
192
+
193
+ ```bash
194
+ aws amp create-workspace --alias antiatropos-metrics --region ap-south-1
195
+
196
+ # Note the workspace ID
197
+ aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1 --query 'workspaces[0].workspaceId' --output text
198
+ ```
199
+
200
+ ### Step 4: Set Up IRSA (2 min)
201
+
202
+ ```bash
203
+ # Prometheus agent needs to write to AMP
204
+ eksctl create iamserviceaccount \
205
+ --cluster antiatropos \
206
+ --namespace monitoring \
207
+ --name prometheus-sa \
208
+ --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
209
+ --approve
210
+ ```
211
+
212
+ ### Step 5: Install Prometheus Agent (2 min)
213
+
214
+ ```bash
215
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
216
+ helm repo update
217
+
218
+ # Replace WORKSPACE_ID
219
+ helm install prometheus-agent prometheus-community/prometheus \
220
+ --namespace monitoring --create-namespace \
221
+ -f deploy/aws/prometheus-agent-values.yaml \
222
+ --set "server.remoteWrite[0].url=https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/remote_write"
223
+ ```
224
+
225
+ ### Step 6: Set Up AMG (5 min)
226
+
227
+ ```bash
228
+ # Create IAM role for AMG
229
+ aws iam create-role \
230
+ --role-name AntiAtroposGrafanaRole \
231
+ --assume-role-policy-document file://deploy/aws/grafana-trust-policy.json
232
+
233
+ aws iam attach-role-policy \
234
+ --role-name AntiAtroposGrafanaRole \
235
+ --policy-arn arn:aws:iam::aws:policy/AmazonPrometheusQueryAccess
236
+
237
+ # Create workspace
238
+ aws grafana create-workspace \
239
+ --workspace-name antiatropos-dashboards \
240
+ --account-access-type CURRENT_ACCOUNT \
241
+ --authentication-method AWS_SSO \
242
+ --permission-type SERVICE_MANAGED \
243
+ --data-sources PROMETHEUS \
244
+ --region ap-south-1
245
+ ```
246
+
247
+ Then in the AMG web UI:
248
+ 1. Sign in with AWS SSO
249
+ 2. Configuration -> Data Sources -> Add AMP workspace
250
+ 3. Dashboards -> Import -> Upload JSON from `deploy/grafana/provisioning/dashboards/json/`
251
+ 4. Select AMP data source when importing
252
+
253
+ ### Step 7: Install Cluster Autoscaler (2 min)
254
+
255
+ ```bash
256
+ helm repo add autoscaler https://kubernetes.github.io/autoscaler
257
+ helm repo update
258
+
259
+ helm install cluster-autoscaler autoscaler/cluster-autoscaler \
260
+ --namespace kube-system \
261
+ -f deploy/aws/cluster-autoscaler-values.yaml
262
+ ```
263
+
264
+ ### Step 8: Generate Kubeconfig for HF Spaces (1 min)
265
+
266
+ ```bash
267
+ ./deploy/aws/generate-kubeconfig.sh
268
+ # Outputs: deploy/aws/kubeconfig-antiatropos.yaml
269
+ ```
270
+
271
+ ### Step 9: Configure HF Spaces
272
+
273
+ See [Section 5](#5-configuring-hf-spaces-to-connect-to-aws) below.
274
+
275
+ ---
276
+
277
+ ## 5. Configuring HF Spaces to Connect to AWS
278
+
279
+ ### Secrets (HF Space Settings -> Repository secrets)
280
+
281
+ | Secret | Value |
282
+ |---|---|
283
+ | `OPENAI_API_KEY` | Your OpenAI API key |
284
+ | `KUBECONFIG_CONTENT` | Base64-encoded content of `kubeconfig-antiatropos.yaml` |
285
+
286
+ To encode the kubeconfig:
287
+ ```bash
288
+ cat deploy/aws/kubeconfig-antiatropos.yaml | base64 -w 0
289
+ ```
290
+
291
+ ### Environment Variables (HF Space Settings -> Variables)
292
+
293
+ | Variable | Value |
294
+ |---|---|
295
+ | `ANTIATROPOS_ENV_MODE` | `live` |
296
+ | `ANTIATROPOS_STRICT_REAL` | `false` |
297
+ | `PROMETHEUS_URL` | `https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID` |
298
+ | `KUBECONFIG` | `/app/kubeconfig.yaml` |
299
+ | `ANTIATROPOS_K8S_NAMESPACE` | `prod-sre` |
300
+ | `ANTIATROPOS_DEPLOYMENT_PREFIX` | `` (empty) |
301
+ | `ANTIATROPOS_MIN_REPLICAS` | `1` |
302
+ | `ANTIATROPOS_MAX_REPLICAS` | `6` |
303
+ | `ANTIATROPOS_SCALE_STEP` | `3` |
304
+ | `ANTIATROPOS_PROM_TIMEOUT_S` | `5.0` |
305
+ | `ANTIATROPOS_METRIC_AGGREGATION` | `sum` |
306
+ | `ANTIATROPOS_WORKLOAD_MAP` | See below |
307
+
308
+ ### Workload Map Value
309
+
310
+ ```json
311
+ {
312
+ "node-0": {"deployment": "payments", "namespace": "prod-sre"},
313
+ "node-1": {"deployment": "checkout", "namespace": "prod-sre"},
314
+ "node-2": {"deployment": "catalog", "namespace": "prod-sre"},
315
+ "node-3": {"deployment": "cart", "namespace": "prod-sre"},
316
+ "node-4": {"deployment": "auth", "namespace": "prod-sre"}
317
+ }
318
+ ```
319
+
320
+ ### Entrypoint Modification
321
+
322
+ Add this to `deploy/entrypoint.sh` before the uvicorn line, so the kubeconfig is decoded from the HF secret:
323
+
324
+ ```bash
325
+ # Decode kubeconfig from HF Spaces secret
326
+ if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
327
+ echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
328
+ export KUBECONFIG=/app/kubeconfig.yaml
329
+ fi
330
+ ```
331
+
332
+ ### Verifying the Connection
333
+
334
+ After deploying, check from HF Spaces that the server can reach AWS:
335
+
336
+ 1. Check the HF Space logs for `antiatropos_step` events
337
+ 2. Look for `Ack: SCALE_UP` messages (agent is reaching EKS)
338
+ 3. Look for non-zero `request_rate` / `cpu_utilization` (PrometheusClient is reaching AMP)
339
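+ 4. If `ANTIATROPOS_STRICT_REAL=false` (recommended), failures fall back to mock silently, so also confirm that replica counts really change on the cluster: `kubectl get deployments -n prod-sre`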
340
+
341
+ ---
342
+
343
+ ## 6. Day-2 Operations
344
+
345
+ ### Scaling Workloads Manually
346
+
347
+ ```bash
348
+ # Scale a specific deployment
349
+ kubectl scale deployment/payments -n prod-sre --replicas=4
350
+
351
+ # Scale all workloads down
352
+ kubectl scale deployment -n prod-sre --replicas=0 --all
353
+
354
+ # Scale all workloads back up
355
+ kubectl scale deployment payments -n prod-sre --replicas=2
356
+ kubectl scale deployment checkout -n prod-sre --replicas=1
357
+ kubectl scale deployment catalog -n prod-sre --replicas=1
358
+ kubectl scale deployment cart -n prod-sre --replicas=1
359
+ kubectl scale deployment auth -n prod-sre --replicas=1
360
+ ```
361
+
362
+ ### Pausing Everything (Without Deleting)
363
+
364
+ ```bash
365
+ # Scale all workloads to 0
366
+ kubectl scale deployment -n prod-sre --replicas=0 --all
367
+
368
+ # Note: EKS nodes still run and cost money.
369
+ # For real savings, delete the cluster (Section 7).
370
+ ```
371
+
372
+ ### Monitoring Agent Behavior
373
+
374
+ Watch what the SRE agent is doing in real-time:
375
+
376
+ ```bash
377
+ # Check how many workload pods the agent has created
378
+ kubectl get deployments -n prod-sre
379
+
380
+ # Check for HorizontalPodAutoscalers that may also be changing replica counts
381
+ kubectl get hpa -A # if any HPAs are defined
382
+
383
+ # Check node pressure
384
+ kubectl top nodes
385
+ ```
386
+
387
+ ### Checking Current Spend
388
+
389
+ ```bash
390
+ # Current month cost by service
391
+ aws ce get-cost-and-usage \
392
+ --time-period Start=$(date +%Y-%m-01),End=$(date +%Y-%m-%d) \
393
+ --granularity MONTHLY \
394
+ --metrics BlendedCost \
395
+ --group-by Type=DIMENSION,Key=SERVICE
396
+ ```
397
+
398
+ ### Regenerating Kubeconfig
399
+
400
+ If the EKS cluster is recreated or credentials expire:
401
+
402
+ ```bash
403
+ ./deploy/aws/generate-kubeconfig.sh
404
+ # Re-upload the base64-encoded content to HF Spaces secret KUBECONFIG_CONTENT
405
+ ```
406
+
407
+ ---
408
+
409
+ ## 7. Teardown & Cost Recovery
410
+
411
+ ### Partial Teardown (Keep Cluster, Stop Workloads)
412
+
413
+ ```bash
414
+ kubectl scale deployment -n prod-sre --replicas=0 --all
415
+ # Still paying for EKS control plane ($73/month) and idle nodes
416
+ ```
417
+
418
+ ### Full Teardown (Stop All Charges)
419
+
420
+ ```bash
421
+ # Delete workloads
422
+ kubectl delete -f deploy/aws/k8s-workloads.yaml
423
+
424
+ # Delete Prometheus agent
425
+ helm uninstall prometheus-agent -n monitoring
426
+ kubectl delete namespace monitoring
427
+
428
+ # Delete AMP workspace
429
+ AMP_WS_ID=$(aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1 --query 'workspaces[0].workspaceId' --output text)
430
+ aws amp delete-workspace --workspace-id $AMP_WS_ID --region ap-south-1
431
+
432
+ # Delete AMG workspace
433
+ AMG_WS_ID=$(aws grafana list-workspaces --region ap-south-1 --query "workspaces[?name=='antiatropos-dashboards'].id | [0]" --output text)
434
+ aws grafana delete-workspace --workspace-id $AMG_WS_ID --region ap-south-1
435
+
436
+ # Delete IAM role for Grafana
437
+ aws iam detach-role-policy --role-name AntiAtroposGrafanaRole --policy-arn arn:aws:iam::aws:policy/AmazonPrometheusQueryAccess
438
+ aws iam detach-role-policy --role-name AntiAtroposGrafanaRole --policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess
439
+ aws iam delete-role --role-name AntiAtroposGrafanaRole
440
+
441
+ # Delete the EKS cluster (10-15 min)
442
+ eksctl delete cluster --name antiatropos --region ap-south-1
443
+
444
+ # Verify nothing is left
445
+ aws eks list-clusters --region ap-south-1
446
+ aws amp list-workspaces --region ap-south-1
447
+ ```
448
+
449
+ Also remove the `KUBECONFIG_CONTENT` secret and reset `PROMETHEUS_URL` to `mock` in your HF Space.
450
+
451
+ ---
452
+
453
+ ## Quick Reference Card
454
+
455
+ | Task | Command |
456
+ |---|---|
457
+ | Deploy AWS infra | `./deploy/aws/deploy.sh` |
458
+ | Check workloads | `kubectl get pods -n prod-sre` |
459
+ | Check monitoring | `kubectl get pods -n monitoring` |
460
+ | Scale a workload | `kubectl scale deployment/payments -n prod-sre --replicas=N` |
461
+ | Pause all workloads | `kubectl scale deployment -n prod-sre --replicas=0 --all` |
462
+ | Check AMP data | `awscurl --service aps "https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WS_ID/api/v1/query?query=up" --region ap-south-1` |
463
+ | Generate kubeconfig | `./deploy/aws/generate-kubeconfig.sh` |
464
+ | Nuke everything | `eksctl delete cluster --name antiatropos --region ap-south-1` |
465
+
deploy/aws/README.md ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos AWS Deployment Guide
2
+
3
+ Deploy the AWS infrastructure (EKS + AMP) that AntiAtropos on Hugging Face Spaces connects to.
4
+
5
+ For FastAPI wiring with `aws` mode and laptop Grafana, see [deploy/aws/FASTAPI_AWS_MODE_GUIDE.md](deploy/aws/FASTAPI_AWS_MODE_GUIDE.md).
6
+
7
+ ## Architecture
8
+
9
+ ```
10
+ Hugging Face Spaces AWS Region (ap-south-1)
11
+ ===================== ======================
12
+ ┌─────────────────────────┐
13
+ │ EKS Cluster │
14
+ ┌─────────────────┐ │ ├── Workload pods │
15
+ │ AntiAtropos │ PROMETHEUS_URL │ │ (payments, checkout │
16
+ │ FastAPI Server │───────────────────>│ │ catalog, cart, auth)│
17
+ │ (port 7860) │ (HTTPS + SigV4) │ ├── Prometheus Agent │
18
+ │ │ │ │ (scrapes workloads, │
19
+ │ │ KUBECONFIG │ │ remote-writes AMP) │
20
+ │ │───────────────────>│ ├── Grafana │
21
+ │ │ (EKS API server) │ │ (self-hosted, │
22
+ │ │ │ │ dashboards) │
23
+ │ │ │ └── Monitoring ns │
24
+ │ │ └─────────────────────────┘
25
+ │ │ ┌─────────────────────────┐
26
+ │ │ │ Amazon Managed │
27
+ │ │ │ Prometheus (AMP) │
28
+ │ │ │ Workspace: antiatropos │
29
+ │ │ └─────────────────────────┘
30
+ └─────────────────┘
31
+ ```
32
+
33
+ **Key principle: FastAPI runs on HF Spaces. AWS runs K8s workloads + AMP + self-hosted Grafana.**
34
+
35
+ ---
36
+
37
+ ## Phase 0: Prerequisites
38
+
39
+ ```powershell
40
+ # AWS CLI v2
41
+ curl.exe "https://awscli.amazonaws.com/AWSCLIV2.msi" -o "AWSCLIV2.msi"
42
+ msiexec /i AWSCLIV2.msi
43
+
44
+ # eksctl
45
+ choco install eksctl
46
+
47
+ # kubectl
48
+ choco install kubernetes-cli
49
+
50
+ # Helm
51
+ choco install kubernetes-helm
52
+
53
+ # Authenticate
54
+ aws configure
55
+ ```
56
+
57
+ ---
58
+
59
+ ## Phase 1: Create the EKS Cluster (15 min)
60
+
61
+ ```bash
62
+ eksctl create cluster -f deploy/aws/eksctl-cluster.yaml
63
+
64
+ # Verify
65
+ aws eks update-kubeconfig --name antiatropos --region ap-south-1
66
+ kubectl get nodes
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Phase 2: Deploy Sample Workloads on EKS
72
+
73
+ These are the microservice deployments the SRE agent will scale up/down:
74
+
75
+ ```bash
76
+ kubectl apply -f deploy/aws/k8s-workloads.yaml
77
+ ```
78
+
79
+ This creates 5 deployments in the `prod-sre` namespace:
80
+ - `payments` (node-0, VIP) — 2 replicas
81
+ - `checkout` (node-1) — 1 replica
82
+ - `catalog` (node-2) — 1 replica
83
+ - `cart` (node-3) — 1 replica
84
+ - `auth` (node-4) — 1 replica
85
+
86
+ Verify:
87
+ ```bash
88
+ kubectl get pods -n prod-sre
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Phase 3: Set Up Amazon Managed Prometheus (AMP)
94
+
95
+ ### Create AMP Workspace
96
+
97
+ ```bash
98
+ aws amp create-workspace \
99
+ --alias antiatropos-metrics \
100
+ --region ap-south-1
101
+
102
+ # Note the workspace ID
103
+ aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1
104
+ ```
105
+
106
+ ### Set Up IRSA for Prometheus Agent
107
+
108
+ ```bash
109
+ eksctl create iamserviceaccount \
110
+ --cluster antiatropos \
111
+ --namespace monitoring \
112
+ --name prometheus-sa \
113
+ --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
114
+ --approve \
115
+ --override-existing-serviceaccounts
116
+ ```
117
+
118
+ ### Install Prometheus Agent on EKS
119
+
120
+ The agent scrapes workload pods and remote-writes metrics to AMP:
121
+
122
+ ```bash
123
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
124
+ helm repo update
125
+
126
+ # Replace WORKSPACE_ID with your AMP workspace ID
127
+ helm install prometheus-agent prometheus-community/prometheus \
128
+ --namespace monitoring --create-namespace \
129
+ -f deploy/aws/prometheus-agent-values.yaml \
130
+ --set "server.remoteWrite[0].url=https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/remote_write"
131
+ ```
132
+
133
+ ### Verify AMP is Receiving Data
134
+
135
+ ```bash
136
+ pip install awscurl
137
+ awscurl --service aps "https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/query?query=up" --region ap-south-1
138
+ ```
139
+
140
+ ---
141
+
142
+ ## Phase 4 (Optional): Set Up Self-Hosted Grafana on EKS
143
+
144
+ If you are on free-tier nodes, skip this section and run Grafana locally on your laptop.
145
+
146
+ ### Install Grafana
147
+
148
+ ```bash
149
+ helm repo add grafana https://grafana.github.io/helm-charts
150
+ helm repo update
151
+
152
+ helm install grafana grafana/grafana \
153
+ --namespace monitoring \
154
+ -f deploy/aws/grafana-values.yaml
155
+ ```
156
+
157
+ ### Create Dashboard Secret
158
+
159
+ ```bash
160
+ kubectl create secret generic antiatropos-grafana-dashboards \
161
+ --from-file=antiatropos-overview.json=deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json \
162
+ --from-file=antiatropos-live.json=deploy/grafana/provisioning/dashboards/json/antiatropos-live.json \
163
+ --namespace monitoring \
164
+ --dry-run=client -o yaml | kubectl apply -f -
165
+ ```
166
+
167
+ ### Access Grafana
168
+
169
+ ```bash
170
+ kubectl port-forward svc/grafana 3000 -n monitoring
171
+ ```
172
+
173
+ Open `http://localhost:3000` in your browser:
174
+ - Username: `admin`
175
+ - Password: `antiatropos`
176
+
177
+ The data source `AMP-Local` is pre-configured to use the local Prometheus agent, and dashboards are auto-imported from the secret.
178
+
179
+ ---
180
+
181
+ ## Phase 5: Generate Kubeconfig for HF Spaces
182
+
183
+ The AntiAtropos server on HF Spaces needs a kubeconfig to talk to EKS:
184
+
185
+ ```bash
186
+ ./deploy/aws/generate-kubeconfig.sh
187
+ ```
188
+
189
+ This outputs `deploy/aws/kubeconfig-antiatropos.yaml`. You'll set this as a secret on HF Spaces.
190
+
191
+ ---
192
+
193
+ ## Phase 6: Configure HF Spaces Environment Variables
194
+
195
+ Set these in your HF Space (Settings → Repository secrets and Variables):
196
+
197
+ ### Secrets
198
+
199
+ | Secret | Value |
200
+ |---|---|
201
+ | `OPENAI_API_KEY` | Your OpenAI API key |
202
+ | `KUBECONFIG_CONTENT` | Full content of `kubeconfig-antiatropos.yaml`, base64-encoded |
203
+
204
+ ### Environment Variables
205
+
206
+ | Variable | Value |
207
+ |---|---|
208
+ | `ANTIATROPOS_ENV_MODE` | `aws` |
209
+ | `ANTIATROPOS_STRICT_REAL` | `false` |
210
+ | `PROMETHEUS_URL` | `https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID` |
211
+ | `KUBECONFIG` | `/app/kubeconfig.yaml` |
212
+ | `ANTIATROPOS_K8S_NAMESPACE` | `prod-sre` |
213
+ | `ANTIATROPOS_MAX_REPLICAS` | `6` |
214
+ | `ANTIATROPOS_MIN_REPLICAS` | `1` |
215
+ | `ANTIATROPOS_SCALE_STEP` | `3` |
216
+ | `ANTIATROPOS_PROM_TIMEOUT_S` | `5.0` |
217
+ | `ANTIATROPOS_METRIC_AGGREGATION` | `sum` |
218
+ | `ANTIATROPOS_WORKLOAD_MAP` | See below |
219
+
220
+ ### Workload Map
221
+
222
+ ```json
223
+ {
224
+ "node-0": {"deployment": "payments", "namespace": "prod-sre"},
225
+ "node-1": {"deployment": "checkout", "namespace": "prod-sre"},
226
+ "node-2": {"deployment": "catalog", "namespace": "prod-sre"},
227
+ "node-3": {"deployment": "cart", "namespace": "prod-sre"},
228
+ "node-4": {"deployment": "auth", "namespace": "prod-sre"}
229
+ }
230
+ ```
231
+
232
+ ### Entrypoint Addition
233
+
234
+ Add this to `deploy/entrypoint.sh` before starting uvicorn, so the kubeconfig is decoded from the HF secret:
235
+
236
+ ```bash
237
+ # Decode kubeconfig from HF Spaces secret
238
+ if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
239
+ echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
240
+ export KUBECONFIG=/app/kubeconfig.yaml
241
+ fi
242
+ ```
243
+
244
+ ### FastAPI Reset Mode
245
+
246
+ Use `mode="aws"` on environment reset for AWS-backed execution. If omitted, the server will use `ANTIATROPOS_ENV_MODE`.
247
+
248
+ ---
249
+
250
+ ## Local Grafana (Recommended on Free Tier)
251
+
252
+ Grafana is only for observability dashboards. Agent action execution stays in FastAPI + Kubernetes executor.
253
+
254
+ Start Grafana locally:
255
+
256
+ ```bash
257
+ docker run -d --name antiatropos-grafana -p 3000:3000 grafana/grafana:latest
258
+ ```
259
+
260
+ Then in Grafana:
261
+
262
+ 1. Add Prometheus datasource using AMP workspace URL:
263
+ - `https://aps-workspaces.<region>.amazonaws.com/workspaces/<WORKSPACE_ID>`
264
+ 2. Enable SigV4 auth and set the same AWS region.
265
+ 3. Import dashboards:
266
+ - [deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json](deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json)
267
+ - [deploy/grafana/provisioning/dashboards/json/antiatropos-live.json](deploy/grafana/provisioning/dashboards/json/antiatropos-live.json)
268
+
269
+ ---
270
+
271
+ ## Phase 7: Install Cluster Autoscaler
272
+
273
+ So EKS can add nodes when the agent scales workloads:
274
+
275
+ ```bash
276
+ helm repo add autoscaler https://kubernetes.github.io/autoscaler
277
+ helm repo update
278
+
279
+ helm install cluster-autoscaler autoscaler/cluster-autoscaler \
280
+ --namespace kube-system \
281
+ -f deploy/aws/cluster-autoscaler-values.yaml
282
+ ```
283
+
284
+ The node group `maxSize: 4` in `eksctl-cluster.yaml` caps your compute cost.
285
+
286
+ ---
287
+
288
+ ## Cost Estimates
289
+
290
+ | Resource | Config | Monthly Cost (approx) |
291
+ |---|---|---|
292
+ | EKS Control Plane | 1 cluster | $73 |
293
+ | EKS Nodes | 2x t3.medium | $60 |
294
+ | AMP | <10GB ingest | ~$3-5 |
295
+ | EBS Volume (Grafana) | 5Gi | ~$0.50 |
296
+ | **Total** | | **~$135-145/month** |
297
+ | HF Spaces | Free tier or $5/mo | (separate billing) |
298
+
299
+ No ECR, no ALB, no server pods on AWS — cheaper than running everything on AWS.
300
+
301
+ ### Cost-Saving Tips
302
+
303
+ - Use spot instances for node groups (60-70% cheaper)
304
+ - Scale workloads to zero between runs: `kubectl scale deployment -n prod-sre --replicas=0 --all`
305
+ - Delete the cluster between training runs: `eksctl delete cluster --name antiatropos`
306
+ - AMP free tier covers the first 40M metric samples ingested and 10 GB of metric storage per month
307
+ - Grafana is self-hosted (free, runs on EKS)
308
+
309
+ ---
310
+
311
+ ## Teardown
312
+
313
+ ```bash
314
+ # Delete workloads
315
+ kubectl delete -f deploy/aws/k8s-workloads.yaml
316
+
317
+ # Delete Grafana
318
+ helm uninstall grafana -n monitoring
319
+
320
+ # Delete Prometheus agent
321
+ helm uninstall prometheus-agent -n monitoring
322
+ kubectl delete namespace monitoring
323
+
324
+ # Delete dashboard secret (no-op if the namespace deletion above already removed it)
325
+ kubectl delete secret antiatropos-grafana-dashboards -n monitoring 2>/dev/null || true
326
+
327
+ # Delete AMP workspace
328
+ AMP_WS_ID=$(aws amp list-workspaces --alias antiatropos-metrics --region ap-south-1 --query 'workspaces[0].workspaceId' --output text)
329
+ aws amp delete-workspace --workspace-id $AMP_WS_ID --region ap-south-1
330
+
331
+ # Delete the EKS cluster (10-15 min)
332
+ eksctl delete cluster --name antiatropos --region ap-south-1
333
+ ```
334
+
335
+ ---
336
+
337
+ ## Troubleshooting
338
+
339
+ ### HF Spaces can't reach AMP
340
+ - Verify `PROMETHEUS_URL` includes the full workspace path
341
+ - AMP requires SigV4 auth — ensure `requests-aws4auth` is in your dependencies
342
+ - Set `ANTIATROPOS_PROM_TIMEOUT_S=5.0` (cross-network latency)
343
+
344
+ ### HF Spaces can't reach EKS
345
+ - Verify `KUBECONFIG` path and the file is decoded properly
346
+ - Check the EKS API server endpoint is public (default)
347
+ - Verify the IAM user in the kubeconfig has EKS access
348
+ - Test locally: `kubectl --kubeconfig=kubeconfig-antiatropos.yaml get nodes`
349
+
350
+ ### AMP not receiving metrics
351
+ ```bash
352
+ kubectl logs -n monitoring -l app.kubernetes.io/name=prometheus
353
+ ```
354
+
355
+ ### Grafana shows no data
356
+ 1. Verify the `AMP-Local` data source is configured: `http://prometheus-agent-server.monitoring.svc.cluster.local:80`
357
+ 2. Check time range (AMP default retention is 150 days)
358
+ 3. Verify PromQL queries match your metric names
359
+ 4. Check Grafana logs: `kubectl logs -n monitoring -l app.kubernetes.io/name=grafana`
360
+ 5. Verify dashboards secret exists: `kubectl get secret antiatropos-grafana-dashboards -n monitoring`
361
+
deploy/aws/cluster-autoscaler-values.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cluster Autoscaler Helm values
2
+ #
3
+ # This ensures EKS adds/removes nodes based on pod scheduling pressure.
4
+ # The node group maxSize in eksctl-cluster.yaml (4) is the ultimate cap.
5
+ #
6
+ # Install:
7
+ # helm repo add autoscaler https://kubernetes.github.io/autoscaler
8
+ # helm repo update
9
+ # helm install cluster-autoscaler autoscaler/cluster-autoscaler \
10
+ # --namespace kube-system \
11
+ # -f cluster-autoscaler-values.yaml
12
+
13
+ autoDiscovery:
14
+ clusterName: antiatropos
15
+ enabled: true
16
+
17
+ awsRegion: ap-south-1
18
+
19
+ # Only scale nodes that have the specific tag
20
+ # This prevents autoscaling unrelated node groups if you add them later
21
+ nodeGroupAutoDiscovery:
22
+ - tags: cluster-autoscaler/cluster-name=antiatropos
23
+
24
+ # Conservative scaling — don't overreact
25
+ scaleDown:
26
+ enabled: true
27
+ # Wait 10 minutes before removing a node
28
+ # This prevents flapping when agents create/destroy pods frequently
29
+ delayAfterAdd: 600s
30
+ delayAfterDelete: 60s
31
+ delayAfterScaleDown: 600s
32
+ # Only remove nodes that are below 50% utilization
33
+ utilizationThreshold: "0.5"
34
+ # Don't remove nodes that run kube-system pods (other than DaemonSet or mirror pods).
35
+ # NOTE: this flag does not by itself protect nodes running AntiAtropos workload pods.
36
+ skipNodesWithSystemPods: true
37
+
38
+ # Maximum time to wait for a newly requested node to be provisioned.
39
+ # This is not a node-count limit — the eksctl node group maxSize is the real limit
40
+ maxNodeProvisionTime: 15m
41
+
42
+ rbac:
43
+ create: true
44
+ serviceAccount:
45
+ create: true
46
+ name: cluster-autoscaler
47
+
48
+ replicaCount: 1
49
+
50
+ resources:
51
+ requests:
52
+ cpu: 100m
53
+ memory: 256Mi
54
+ limits:
55
+ cpu: 500m
56
+ memory: 512Mi
57
+
deploy/aws/deploy-all.ps1 ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# AntiAtropos - One-Run Deploy Script
# Provisions the whole AWS side in one pass: EKS cluster, workloads, AMP, Prometheus, Grafana

# Treat cmdlet errors as terminating so the deploy stops at the first failure.
$ErrorActionPreference = "Stop"

# When the PSNativeCommandUseErrorActionPreference preference variable exists
# (newer PowerShell 7 releases), switch it off so native-command failures are
# handled by the explicit $LASTEXITCODE checks in this script instead.
if (Get-Variable -Name PSNativeCommandUseErrorActionPreference -ErrorAction SilentlyContinue) {
    $PSNativeCommandUseErrorActionPreference = $false
}

# Deployment target and the directory this script lives in.
$Region = "ap-south-1"
$ClusterName = "antiatropos"
$AwsDir = Split-Path -Path $MyInvocation.MyCommand.Path -Parent

# Grafana mode comes from ANTIATROPOS_GRAFANA_MODE (trimmed, lower-cased);
# it defaults to "auto" when the variable is unset or blank.
if ([string]::IsNullOrWhiteSpace($env:ANTIATROPOS_GRAFANA_MODE)) {
    $GrafanaMode = "auto"
} else {
    $GrafanaMode = $env:ANTIATROPOS_GRAFANA_MODE.Trim().ToLowerInvariant()
}
$GrafanaModeResolved = "cluster"
17
function Invoke-CheckedCommand {
    <#
    .SYNOPSIS
    Runs a script block and throws if it leaves a non-zero native exit code.

    .DESCRIPTION
    $ErrorActionPreference is relaxed to "Continue" while the block runs and is
    always restored afterwards. $LASTEXITCODE is reset to 0 before the block is
    invoked so that a stale non-zero exit code from an earlier native command
    cannot make a successful block (for example one that only calls cmdlets)
    look like a failure.

    .PARAMETER Command
    Script block to invoke. Its output is passed through to the caller.

    .PARAMETER ErrorMessage
    Message thrown when $LASTEXITCODE is non-zero after the block has run.
    #>
    param(
        [ScriptBlock]$Command,
        [string]$ErrorMessage
    )

    $previousErrorActionPreference = $ErrorActionPreference
    $ErrorActionPreference = "Continue"

    # Start from a clean slate: only an exit code set by $Command itself counts.
    $global:LASTEXITCODE = 0
    try {
        & $Command
    } finally {
        $ErrorActionPreference = $previousErrorActionPreference
    }

    if ($LASTEXITCODE -ne 0) {
        throw $ErrorMessage
    }
}
35
+
36
function Get-EksClusterStatus {
    # Asks `aws eks describe-cluster` for the cluster's status and returns it trimmed.
    # Returns $null when the call throws, exits non-zero, or yields a blank / "None" value.
    param(
        [string]$Name,
        [string]$AwsRegion
    )

    try {
        $rawStatus = aws eks describe-cluster --name $Name --region $AwsRegion --query 'cluster.status' --output text 2>$null
    } catch {
        return $null
    }

    # Guard clauses: any sign of "no usable answer" maps to $null.
    if ($LASTEXITCODE -ne 0) {
        return $null
    }
    if ([string]::IsNullOrWhiteSpace($rawStatus)) {
        return $null
    }
    if ($rawStatus -eq "None") {
        return $null
    }

    return $rawStatus.Trim()
}
53
+
54
function Test-EksNodegroupExists {
    # Returns $true when `aws eks describe-nodegroup` succeeds for the given
    # cluster/nodegroup/region, and $false when it exits non-zero or throws.
    param(
        [string]$Cluster,
        [string]$Nodegroup,
        [string]$AwsRegion
    )

    $exists = $false
    try {
        # Only the exit code matters; the command output is discarded.
        $null = aws eks describe-nodegroup --cluster-name $Cluster --nodegroup-name $Nodegroup --region $AwsRegion --query 'nodegroup.nodegroupName' --output text 2>$null
        $exists = ($LASTEXITCODE -eq 0)
    } catch {
        $exists = $false
    }

    return $exists
}
68
+
69
function Get-EksNodegroupInstanceType {
    # Returns the first instance type listed for the nodegroup (trimmed), as reported
    # by `aws eks describe-nodegroup`. Returns $null when the call throws, exits
    # non-zero, or yields a blank / "None" value.
    param(
        [string]$Cluster,
        [string]$Nodegroup,
        [string]$AwsRegion
    )

    try {
        $rawType = aws eks describe-nodegroup --cluster-name $Cluster --nodegroup-name $Nodegroup --region $AwsRegion --query 'nodegroup.instanceTypes[0]' --output text 2>$null
    } catch {
        return $null
    }

    if ($LASTEXITCODE -ne 0) {
        return $null
    }
    if ([string]::IsNullOrWhiteSpace($rawType)) {
        return $null
    }
    if ($rawType -eq "None") {
        return $null
    }

    return $rawType.Trim()
}
88
+
89
function Get-NodegroupSubnetSelection {
    # Picks the subnets a nodegroup should be created in.
    #
    # Reads the cluster's subnet IDs, asks EC2 which of them have
    # MapPublicIpOnLaunch enabled, and returns an object with:
    #   SubnetCsv            - comma-separated subnet IDs to use
    #   UsePrivateNetworking - $false when public subnets were chosen, $true otherwise
    #   SubnetType           - "public" or "private"
    # Public subnets are preferred; the remaining subnets are used only when
    # no public subnet is found. Throws when either AWS call fails or no
    # subnets can be determined.
    param(
        [string]$Cluster,
        [string]$AwsRegion
    )

    # --- 1. All subnet IDs attached to the cluster ---------------------------
    try {
        $clusterSubnetText = aws eks describe-cluster --name $Cluster --region $AwsRegion --query 'cluster.resourcesVpcConfig.subnetIds' --output text 2>$null
    } catch {
        throw "Failed to read cluster subnet IDs"
    }

    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($clusterSubnetText)) {
        throw "Failed to read cluster subnet IDs"
    }

    $clusterSubnets = @($clusterSubnetText -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    if ($clusterSubnets.Count -eq 0) {
        throw "No subnets found for cluster '$Cluster' in region '$AwsRegion'"
    }

    # --- 2. Which of those auto-assign public IPs ----------------------------
    $ec2Args = @('ec2', 'describe-subnets', '--region', $AwsRegion, '--subnet-ids')
    $ec2Args += $clusterSubnets
    $ec2Args += @('--query', 'Subnets[?MapPublicIpOnLaunch==true].SubnetId', '--output', 'text')

    try {
        $publicText = & aws @ec2Args 2>$null
    } catch {
        throw "Failed to classify cluster subnets"
    }

    if ($LASTEXITCODE -ne 0) {
        throw "Failed to classify cluster subnets"
    }

    # The CLI prints "None" for an empty result, so that token is dropped too.
    $publicIds = @($publicText -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) -and $_ -ne "None" })
    $privateIds = @($clusterSubnets | Where-Object { $_ -notin $publicIds })

    # --- 3. Prefer public subnets, otherwise fall back to the rest -----------
    if ($publicIds.Count -gt 0) {
        $chosenIds = $publicIds
        $privateNetworking = $false
        $chosenType = "public"
    } elseif ($privateIds.Count -gt 0) {
        $chosenIds = $privateIds
        $privateNetworking = $true
        $chosenType = "private"
    } else {
        throw "Could not determine valid subnets for nodegroup creation"
    }

    return [PSCustomObject]@{
        SubnetCsv = ($chosenIds -join ',')
        UsePrivateNetworking = $privateNetworking
        SubnetType = $chosenType
    }
}
150
+
151
function Get-ReadyNodeCount {
    # Returns how many lines of `kubectl get nodes --no-headers` contain the
    # whitespace-delimited word "Ready" (so "NotReady" is not counted).
    # Yields 0 when kubectl produces no output.
    $listing = kubectl get nodes --no-headers 2>$null
    if ($listing) {
        $readyRows = @($listing | Select-String -Pattern '\sReady\s')
        return $readyRows.Count
    }

    return 0
}
159
+
160
function Wait-ForReadyNodes {
    # Polls Get-ReadyNodeCount every 10 seconds until at least
    # $MinimumReadyNodes nodes are Ready.
    #
    # Parameters:
    #   MinimumReadyNodes - number of Ready nodes to wait for.
    #   TimeoutSeconds    - overall budget; rounded up to whole 10s polls.
    # Throws when the budget is exhausted before the target is reached.
    param(
        [int]$MinimumReadyNodes,
        [int]$TimeoutSeconds = 600
    )

    $pollsLeft = [Math]::Ceiling($TimeoutSeconds / 10)
    while ($pollsLeft -gt 0) {
        $current = Get-ReadyNodeCount
        Write-Host "Nodes ready: $current (target: $MinimumReadyNodes)"
        if ($current -ge $MinimumReadyNodes) {
            return
        }

        Start-Sleep -Seconds 10
        $pollsLeft--
    }

    throw "Timed out waiting for $MinimumReadyNodes Ready nodes"
}
178
+
179
# Banner: show which region/cluster this run targets.
Write-Host ""
foreach ($bannerLine in @(
        "==========================================",
        " AntiAtropos AWS Infrastructure Deploy",
        "=========================================="
    )) {
    Write-Host $bannerLine -ForegroundColor Cyan
}
Write-Host "Region: $Region"
Write-Host "Cluster: $ClusterName"
Write-Host ""

# Check prerequisites: every CLI the later phases shell out to must be on PATH.
$missing = @(
    @("aws", "eksctl", "kubectl", "helm") |
        Where-Object { -not (Get-Command $_ -ErrorAction SilentlyContinue) }
)
if ($missing.Count -gt 0) {
    Write-Host "ERROR: Missing: $($missing -join ', ')" -ForegroundColor Red
    exit 1
}
198
+
199
# Phase 1: Create EKS Cluster
# Creates the control plane only (nodegroups are handled in Phase 2), or waits
# for an existing cluster that is still being created/deleted.
Write-Host ">>> Phase 1: Creating EKS cluster..." -ForegroundColor Yellow

$clusterStatus = Get-EksClusterStatus -Name $ClusterName -AwsRegion $Region

if ($clusterStatus -eq "DELETING") {
    Write-Host "Cluster is currently deleting. Waiting for deletion to complete..." -ForegroundColor Yellow
    Invoke-CheckedCommand -Command { aws eks wait cluster-deleted --name $ClusterName --region $Region } -ErrorMessage "Failed while waiting for cluster deletion"
    $clusterStatus = $null
}

if (-not $clusterStatus) {
    $TempConfig = Join-Path $AwsDir "eksctl-cluster-only.yaml"
    $ClusterYaml = Get-Content (Join-Path $AwsDir "eksctl-cluster.yaml") -Raw
    # Drops everything from "managedNodeGroups:" to the end of the file.
    # NOTE(review): this also removes any section that follows managedNodeGroups
    # in eksctl-cluster.yaml - confirm nothing else lives after it.
    $ClusterOnlyYaml = $ClusterYaml -replace '(?s)(managedNodeGroups:.*)', ''
    try {
        $ClusterOnlyYaml | Out-File -FilePath $TempConfig -Encoding utf8
        Invoke-CheckedCommand -Command { eksctl create cluster -f $TempConfig } -ErrorMessage "Failed to create EKS cluster"
    } finally {
        # Always clean up the generated config, including when creation fails.
        if (Test-Path $TempConfig) {
            Remove-Item $TempConfig -Force
        }
    }
    Write-Host "Cluster created" -ForegroundColor Green
} else {
    if ($clusterStatus -eq "CREATING") {
        Write-Host "Cluster creation in progress. Waiting until ACTIVE..." -ForegroundColor Yellow
        Invoke-CheckedCommand -Command { aws eks wait cluster-active --name $ClusterName --region $Region } -ErrorMessage "Cluster did not become active"
        # The waiter only returns successfully once the cluster is ACTIVE.
        $clusterStatus = "ACTIVE"
    }
    Write-Host "Cluster already exists (status: $clusterStatus)" -ForegroundColor Green
}

Invoke-CheckedCommand -Command { aws eks wait cluster-active --name $ClusterName --region $Region } -ErrorMessage "Cluster is not active"
Invoke-CheckedCommand -Command { aws eks update-kubeconfig --name $ClusterName --region $Region | Out-Null } -ErrorMessage "Failed to update kubeconfig"
228
+
229
# Phase 2: Create Nodegroup
# Creates the managed nodegroup when it does not exist yet, then waits for it
# to report ACTIVE.
Write-Host ""
Write-Host ">>> Phase 2: Ensuring compute nodegroup..." -ForegroundColor Yellow

$NodegroupName = "linux-nodes"
$PreferredInstanceType = "t3.micro"
$ngExists = Test-EksNodegroupExists -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region

if (-not $ngExists) {
    $SubnetSelection = Get-NodegroupSubnetSelection -Cluster $ClusterName -AwsRegion $Region
    $SubnetCsv = $SubnetSelection.SubnetCsv
    $UsePrivateNetworking = [bool]$SubnetSelection.UsePrivateNetworking

    Write-Host "Using $($SubnetSelection.SubnetType) subnets: $SubnetCsv"

    Invoke-CheckedCommand -Command {
        # Named $eksctlArgs rather than $args: $args is a PowerShell automatic
        # variable and should not be assigned to.
        $eksctlArgs = @(
            'create', 'nodegroup',
            '--cluster', $ClusterName,
            '--region', $Region,
            '--name', $NodegroupName,
            '--node-type', $PreferredInstanceType,
            '--nodes', '4',
            '--nodes-min', '2',
            '--nodes-max', '8',
            '--node-volume-size', '20',
            '--subnet-ids', $SubnetCsv
        )

        if ($UsePrivateNetworking) {
            $eksctlArgs += '--node-private-networking'
        }

        eksctl @eksctlArgs
    } -ErrorMessage "Failed to create nodegroup '$NodegroupName'"

    Write-Host "Nodegroup created" -ForegroundColor Green
} else {
    $existingInstanceType = Get-EksNodegroupInstanceType -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
    # The lookup returns $null on failure; avoid printing an empty "()".
    if ([string]::IsNullOrWhiteSpace($existingInstanceType)) {
        $existingInstanceType = "instance type unknown"
    }
    Write-Host "Nodegroup already exists ($existingInstanceType)" -ForegroundColor Green
}

Invoke-CheckedCommand -Command { aws eks wait nodegroup-active --cluster-name $ClusterName --nodegroup-name $NodegroupName --region $Region } -ErrorMessage "Nodegroup did not become active"
272
+
273
# Resolve where Grafana should run: "cluster" (Helm release on EKS) or
# "external". In auto mode the decision follows the nodegroup instance type:
# t3.micro nodes resolve to external, anything else to cluster.
$autoModes = @("auto", "")
$externalModes = @("external", "local", "hf")

$GrafanaModeResolved = "cluster"
if ($autoModes -contains $GrafanaMode) {
    $effectiveNodeType = Get-EksNodegroupInstanceType -Cluster $ClusterName -Nodegroup $NodegroupName -AwsRegion $Region
    if ($effectiveNodeType -eq "t3.micro") {
        $GrafanaModeResolved = "external"
    }
} elseif ($externalModes -contains $GrafanaMode) {
    $GrafanaModeResolved = "external"
}

Write-Host "Grafana mode: $GrafanaModeResolved" -ForegroundColor Cyan
287
+
288
# Wait (up to 60 polls, 10s apart) until at least one node reports Ready.
# Previously the loop stopped as soon as any node was listed, even if none
# were Ready yet, and gave no feedback when the wait ran out.
Write-Host "Waiting for nodes..."
$readyCount = 0
for ($i = 0; $i -lt 60; $i++) {
    $nodes = $null
    try {
        $nodes = kubectl get nodes --no-headers --request-timeout=10s 2>$null
    } catch {
        # API server not reachable yet; treat as "no nodes" and retry.
        $nodes = $null
    }

    if ($nodes) {
        # '\sReady\s' matches the STATUS column value "Ready" but not "NotReady".
        $readyCount = @($nodes | Select-String -Pattern '\sReady\s').Count
        if ($readyCount -gt 0) {
            Write-Host "Nodes ready: $readyCount" -ForegroundColor Green
            break
        }
    }
    Start-Sleep -Seconds 10
}

if ($readyCount -eq 0) {
    # Non-fatal, as before: later phases still run, but the operator is told.
    Write-Host "WARNING: No nodes reported Ready yet. Check with: kubectl get nodes" -ForegroundColor Yellow
}
305
+
306
# Phase 3: Deploy Workloads
# Ensures the prod-sre namespace exists (idempotent create via dry-run + apply)
# and applies the workload manifest. Failures now stop the script instead of
# being followed by a misleading "Workloads deployed" message.
Write-Host ""
Write-Host ">>> Phase 3: Deploying workloads..." -ForegroundColor Yellow
$WorkloadsYaml = Join-Path $AwsDir "k8s-workloads.yaml"
Invoke-CheckedCommand -Command { kubectl create namespace prod-sre --dry-run=client -o yaml | kubectl apply -f - | Out-Null } -ErrorMessage "Failed to ensure namespace 'prod-sre'"
Invoke-CheckedCommand -Command { kubectl apply -f $WorkloadsYaml | Out-Null } -ErrorMessage "Failed to apply workloads manifest"
Write-Host "Workloads deployed" -ForegroundColor Green
312
+
313
# Phase 4: Create AMP Workspace
# Reuses the workspace whose alias is "antiatropos-metrics" when one is listed,
# otherwise creates it. Sets $AmpWsId and $AmpUrl for later phases.
Write-Host ""
Write-Host ">>> Phase 4: Creating AMP workspace..." -ForegroundColor Yellow

$AmpWsId = $null
try {
    $AmpWsId = aws amp list-workspaces --alias antiatropos-metrics --region $Region --query 'workspaces[0].workspaceId' --output text 2>$null
    # The CLI prints the literal text "None" when the query matches nothing.
    if ($AmpWsId -eq "None") { $AmpWsId = $null }
} catch {
    # Lookup is best-effort; fall through to creation.
    $AmpWsId = $null
}

if ([string]::IsNullOrWhiteSpace($AmpWsId)) {
    $AmpWsId = aws amp create-workspace --alias antiatropos-metrics --region $Region --query 'workspaceId' --output text
    # Without a workspace ID the remote-write URL below would be malformed.
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($AmpWsId) -or $AmpWsId -eq "None") {
        throw "Failed to create AMP workspace 'antiatropos-metrics' in region '$Region'"
    }
}
$AmpWsId = "$AmpWsId".Trim()
$AmpUrl = "https://aps-workspaces.$Region.amazonaws.com/workspaces/$AmpWsId"
Write-Host "AMP: $AmpWsId" -ForegroundColor Green
328
+
329
# Phase 5: Install Prometheus
# Installs/upgrades the prometheus-community/prometheus chart as release
# "prometheus-agent" in the monitoring namespace, remote-writing to AMP.
Write-Host ""
Write-Host ">>> Phase 5: Installing Prometheus..." -ForegroundColor Yellow

kubectl create namespace monitoring --dry-run=client -o yaml | kubectl apply -f - | Out-Null
Invoke-CheckedCommand -Command { helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>$null | Out-Null } -ErrorMessage "Failed to add prometheus helm repo"
Invoke-CheckedCommand -Command { helm repo update 2>$null | Out-Null } -ErrorMessage "Failed to update helm repos"

$promValuesYaml = Join-Path $AwsDir "prometheus-agent-values.yaml"
$remoteWriteUrl = "$AmpUrl/api/v1/remote_write"

# Chart value overrides, applied on top of the values file in this order.
$promOverrides = @(
    "alertmanager.enabled=false",
    "kube-state-metrics.enabled=false",
    "prometheus-node-exporter.enabled=false",
    "pushgateway.enabled=false",
    "server.enabled=true",
    "server.persistentVolume.enabled=false",
    "server.resources.requests.cpu=50m",
    "server.resources.requests.memory=128Mi",
    "server.resources.limits.cpu=300m",
    "server.resources.limits.memory=384Mi",
    "server.global.scrape_interval=15s",
    "server.remoteWrite[0].url=$remoteWriteUrl"
)

$promHelmArgs = @(
    'upgrade', '--install', 'prometheus-agent', 'prometheus-community/prometheus',
    '--namespace', 'monitoring', '--reset-values', '-f', $promValuesYaml
)
foreach ($override in $promOverrides) {
    $promHelmArgs += @('--set', $override)
}

Invoke-CheckedCommand -Command {
    helm @promHelmArgs 2>&1 | Out-Null
} -ErrorMessage "Failed to install/upgrade Prometheus"
Write-Host "Prometheus installed" -ForegroundColor Green
357
+
358
# Phase 6: Install Grafana
# "cluster" mode installs the grafana/grafana chart and, if the rollout stalls
# because the scheduler reports "Too many pods" / "Insufficient memory", scales
# the nodegroup to 8 nodes and retries. Any other mode removes an existing
# in-cluster release instead.
Write-Host ""
if ($GrafanaModeResolved -ne "cluster") {
    Write-Host ">>> Phase 6: Skipping in-cluster Grafana (external mode)..." -ForegroundColor Yellow
    $grafanaRelease = ""
    try {
        $grafanaRelease = helm list -n monitoring --filter '^grafana$' --short 2>$null
    } catch {
        $grafanaRelease = ""
    }

    $hasInClusterRelease = -not [string]::IsNullOrWhiteSpace($grafanaRelease)
    if ($hasInClusterRelease) {
        helm uninstall grafana -n monitoring 2>$null | Out-Null
        kubectl delete pvc grafana -n monitoring 2>$null | Out-Null
        Write-Host "Removed existing in-cluster Grafana release to save resources" -ForegroundColor Green
    }
} else {
    Write-Host ">>> Phase 6: Installing Grafana in-cluster..." -ForegroundColor Yellow

    Invoke-CheckedCommand -Command { helm repo add grafana https://grafana.github.io/helm-charts 2>$null | Out-Null } -ErrorMessage "Failed to add grafana helm repo"
    Invoke-CheckedCommand -Command { helm repo update 2>$null | Out-Null } -ErrorMessage "Failed to update helm repos"

    $GrafanaValuesYaml = Join-Path $AwsDir "grafana-values.yaml"
    Invoke-CheckedCommand -Command { helm upgrade --install grafana grafana/grafana --namespace monitoring -f $GrafanaValuesYaml 2>&1 | Out-Null } -ErrorMessage "Failed to install/upgrade Grafana"

    # Returns the first Pending Grafana pod line (or nothing).
    $findPendingGrafanaPod = {
        kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana --field-selector=status.phase=Pending --no-headers 2>$null | Select-Object -First 1
    }

    Write-Host "Waiting for Grafana..."
    try {
        Invoke-CheckedCommand -Command { kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>$null | Out-Null } -ErrorMessage "Grafana rollout timed out"
    } catch {
        # Rollout did not finish: look for a scheduling reason on a Pending pod.
        $pendingReason = ""
        $stuckPodLine = & $findPendingGrafanaPod

        if ($stuckPodLine) {
            $stuckPodName = ($stuckPodLine -split '\s+')[0]
            $pendingReason = kubectl describe pod $stuckPodName -n monitoring 2>$null | Select-String -Pattern "FailedScheduling|Insufficient memory|Too many pods|unbound" -Context 0,2 | Out-String
            if (-not [string]::IsNullOrWhiteSpace($pendingReason)) {
                Write-Host "Grafana is pending due to scheduler constraints:" -ForegroundColor Yellow
                Write-Host $pendingReason -ForegroundColor Yellow
            }
        }

        # Only capacity problems are worth a scale-up; everything else is fatal.
        if (-not ($pendingReason -match "Too many pods|Insufficient memory")) {
            throw "Grafana rollout failed. Check: kubectl -n monitoring get pods ; kubectl -n monitoring describe pod -l app.kubernetes.io/name=grafana"
        }

        Write-Host "Scaling nodegroup to 8 nodes and retrying Grafana rollout..." -ForegroundColor Yellow
        Invoke-CheckedCommand -Command { eksctl scale nodegroup --cluster $ClusterName --region $Region --name $NodegroupName --nodes 8 } -ErrorMessage "Failed to scale nodegroup"
        Invoke-CheckedCommand -Command { aws eks wait nodegroup-active --cluster-name $ClusterName --nodegroup-name $NodegroupName --region $Region } -ErrorMessage "Nodegroup did not become active after scaling"
        Write-Host "Waiting for newly scaled nodes to become Ready..." -ForegroundColor Yellow
        Wait-ForReadyNodes -MinimumReadyNodes 8 -TimeoutSeconds 900

        # Delete a still-Pending pod so the deployment recreates it.
        $stuckPodLineAfterScale = & $findPendingGrafanaPod
        if ($stuckPodLineAfterScale) {
            $stuckPodNameAfterScale = ($stuckPodLineAfterScale -split '\s+')[0]
            kubectl delete pod $stuckPodNameAfterScale -n monitoring 2>$null | Out-Null
        }

        Invoke-CheckedCommand -Command { kubectl rollout status deployment/grafana --namespace monitoring --timeout=600s 2>$null | Out-Null } -ErrorMessage "Grafana rollout timed out after scaling"
    }
    Write-Host "Grafana installed (admin/antiatropos)" -ForegroundColor Green
}
420
+
421
# Phase 7: Install Cluster Autoscaler
# Installs/upgrades the autoscaler/cluster-autoscaler chart in kube-system
# using the values file that sits next to this script.
Write-Host ""
Write-Host ">>> Phase 7: Installing Cluster Autoscaler..." -ForegroundColor Yellow

Invoke-CheckedCommand -ErrorMessage "Failed to add autoscaler helm repo" -Command {
    helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>$null | Out-Null
}
Invoke-CheckedCommand -ErrorMessage "Failed to update helm repos" -Command {
    helm repo update 2>$null | Out-Null
}

$autoscalerValues = Join-Path $AwsDir "cluster-autoscaler-values.yaml"
Invoke-CheckedCommand -ErrorMessage "Failed to install/upgrade Cluster Autoscaler" -Command {
    helm upgrade --install cluster-autoscaler autoscaler/cluster-autoscaler --namespace kube-system -f $autoscalerValues 2>&1 | Out-Null
}
Write-Host "Cluster Autoscaler installed" -ForegroundColor Green
431
+
432
# Phase 8: Generate Kubeconfig
# Writes a standalone kubeconfig next to this script that authenticates through
# `aws eks get-token`. The file is assembled line by line with explicit
# indentation because the YAML nesting is significant.
Write-Host ""
Write-Host ">>> Phase 8: Generating kubeconfig..." -ForegroundColor Yellow

$ClusterEndpoint = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.endpoint' --output text
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($ClusterEndpoint) -or $ClusterEndpoint -eq "None") {
    throw "Failed to read API endpoint for cluster '$ClusterName'"
}
$ClusterCa = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.certificateAuthority.data' --output text
if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($ClusterCa) -or $ClusterCa -eq "None") {
    throw "Failed to read certificate authority data for cluster '$ClusterName'"
}
$Timestamp = (Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")
$output = Join-Path $AwsDir "kubeconfig-antiatropos.yaml"

$kubeconfigLines = @(
    "# Generated: $Timestamp",
    "apiVersion: v1",
    "kind: Config",
    "clusters:",
    "  - cluster:",
    "      certificate-authority-data: $ClusterCa",
    "      server: $ClusterEndpoint",
    "    name: $ClusterName",
    "contexts:",
    "  - context:",
    "      cluster: $ClusterName",
    "      user: antiatropos-hf-user",
    "    name: $ClusterName",
    "current-context: $ClusterName",
    "preferences: {}",
    "users:",
    "  - name: antiatropos-hf-user",
    "    user:",
    "      exec:",
    "        apiVersion: client.authentication.k8s.io/v1beta1",
    "        command: aws",
    "        args:",
    "          - eks",
    "          - get-token",
    "          - --region",
    "          - $Region",
    "          - --cluster-name",
    "          - $ClusterName",
    "        env:",
    "          - name: AWS_STS_REGIONAL_ENDPOINTS",
    "            value: regional",
    "          - name: AWS_DEFAULT_REGION",
    "            value: $Region",
    "        interactiveMode: IfAvailable"
)
# LF-joined with a trailing newline, matching the previous string layout.
$kubeconfig = ($kubeconfigLines -join "`n") + "`n"

$kubeconfig | Out-File -FilePath $output -Encoding utf8 -Force
Write-Host "Kubeconfig: $output" -ForegroundColor Green
477
+
478
# Done
# Final summary: AMP workspace, how to reach Grafana for the resolved mode,
# and where the kubeconfig was written.
Write-Host ""
foreach ($bannerLine in @(
        "==========================================",
        " Deployment Complete!",
        "=========================================="
    )) {
    Write-Host $bannerLine -ForegroundColor Cyan
}
Write-Host ""

$summaryLines = @("AMP: $AmpWsId")
if ($GrafanaModeResolved -eq "cluster") {
    $summaryLines += "Grafana: kubectl port-forward svc/grafana 3000 -n monitoring"
    $summaryLines += "Login: admin / antiatropos"
} else {
    $summaryLines += "Grafana: external/local mode enabled (recommended for free-tier nodes)"
    $summaryLines += "Use AMP endpoint as Prometheus datasource with SigV4 auth"
}
$summaryLines += "Kubeconfig: $output"

foreach ($summaryLine in $summaryLines) {
    Write-Host $summaryLine -ForegroundColor Yellow
}
Write-Host ""
deploy/aws/deploy.ps1 ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos AWS Infrastructure Deploy Script (PowerShell)
2
+ #
3
+ # Deploys: EKS cluster, sample workloads, AMP workspace, Prometheus Agent,
4
+ # IRSA for Prometheus, self-hosted Grafana on EKS, Cluster Autoscaler, and generates kubeconfig for HF Spaces.
5
+ #
6
+ # The AntiAtropos FastAPI server runs on Hugging Face Spaces, NOT on AWS.
7
+ # This script only sets up the infrastructure that HF Spaces connects to.
8
+ #
9
+ # Prerequisites: aws cli, eksctl, kubectl, helm
10
+ #
11
+ # Usage:
12
+ # .\deploy\aws\deploy.ps1
13
+ #
14
+ # Environment variables:
15
+ # $env:AWS_REGION - AWS region (default: ap-south-1)
16
+ # $env:CLUSTER_NAME - EKS cluster name (default: antiatropos)
17
+
18
$ErrorActionPreference = "Stop"

# Settings: environment variables override the built-in defaults.
$Region = "ap-south-1"
if ($env:AWS_REGION) {
    $Region = $env:AWS_REGION
}
$ClusterName = "antiatropos"
if ($env:CLUSTER_NAME) {
    $ClusterName = $env:CLUSTER_NAME
}
# Directory containing this script; sibling YAML/value files are read from it.
$AwsDir = Split-Path -Parent $MyInvocation.MyCommand.Path

Write-Host ""
Write-Host "=== AntiAtropos AWS Infrastructure Deployment ===" -ForegroundColor Cyan
foreach ($infoLine in @(
        "Region: $Region",
        "Cluster: $ClusterName",
        "FastAPI: Runs on HF Spaces (not deployed here)",
        ""
    )) {
    Write-Host $infoLine
}

# --- Check prerequisites ---
$missing = @(
    @("aws", "eksctl", "kubectl", "helm") |
        Where-Object { -not (Get-Command $_ -ErrorAction SilentlyContinue) }
)
if ($missing.Count -gt 0) {
    Write-Host "ERROR: Missing prerequisites: $($missing -join ', ')" -ForegroundColor Red
    Write-Host "Install them first:" -ForegroundColor Yellow
    Write-Host " choco install awscli eksctl kubernetes-cli kubernetes-helm -y" -ForegroundColor Yellow
    exit 1
}
44
+
45
# --- Phase 1: Create EKS Cluster ---
# Creates the cluster without its nodegroup (added in Phase 1b) unless it
# already exists, then points the local kubeconfig at it.
Write-Host ""
Write-Host ">>> Phase 1: Creating EKS cluster (without nodegroup)..." -ForegroundColor Yellow

# Native commands report failure through $LASTEXITCODE, not exceptions, so the
# exit code decides whether the cluster exists. The try/catch only covers
# hosts that surface redirected stderr as a terminating error.
$clusterExists = $false
try {
    eksctl get cluster --name $ClusterName --region $Region 2>$null | Out-Null
    $clusterExists = ($LASTEXITCODE -eq 0)
} catch {
    $clusterExists = $false
}

if ($clusterExists) {
    Write-Host "Cluster $ClusterName already exists, skipping creation."
} else {
    # Create cluster without nodegroup first (faster, avoids timeout)
    $TempClusterConfig = Join-Path $AwsDir "eksctl-cluster-only.yaml"
    $ClusterYaml = Get-Content (Join-Path $AwsDir "eksctl-cluster.yaml") -Raw
    # Remove nodegroups section for initial cluster creation.
    # NOTE(review): the pattern removes everything from "managedNodeGroups:" to
    # the end of the file - confirm no other section follows it.
    $ClusterOnlyYaml = $ClusterYaml -replace '(?s)(managedNodeGroups:.*)', ''
    try {
        $ClusterOnlyYaml | Out-File -FilePath $TempClusterConfig -Encoding utf8
        eksctl create cluster -f $TempClusterConfig
        if ($LASTEXITCODE -ne 0) {
            throw "Failed to create EKS cluster '$ClusterName'"
        }
    } finally {
        # Remove the generated config even when creation fails.
        if (Test-Path $TempClusterConfig) {
            Remove-Item $TempClusterConfig -Force
        }
    }
    Write-Host "Cluster created." -ForegroundColor Green
}

aws eks update-kubeconfig --name $ClusterName --region $Region
if ($LASTEXITCODE -ne 0) {
    throw "Failed to update kubeconfig for cluster '$ClusterName'"
}
Write-Host "kubeconfig updated."
71
+
72
# --- Phase 1b: Create Nodegroup Separately ---
# Creates the "linux-nodes" nodegroup from eksctl-cluster.yaml unless eksctl
# already lists it for this cluster.
Write-Host ""
Write-Host ">>> Phase 1b: Creating nodegroup (separate step to avoid timeout)..." -ForegroundColor Yellow

# Select-String does not throw when nothing matches, so the match result
# itself (not the absence of an exception) decides whether the group exists.
$nodegroupExists = $false
try {
    $nodegroupListing = eksctl get nodegroup --cluster $ClusterName --region $Region 2>$null
    if ($LASTEXITCODE -eq 0 -and $nodegroupListing) {
        $nodegroupExists = [bool]($nodegroupListing | Select-String -Pattern "linux-nodes" -SimpleMatch)
    }
} catch {
    $nodegroupExists = $false
}

if ($nodegroupExists) {
    Write-Host "Nodegroup already exists, skipping creation."
} else {
    # Create nodegroup separately (better error handling, can retry)
    eksctl create nodegroup --config-file (Join-Path $AwsDir "eksctl-cluster.yaml")
    if ($LASTEXITCODE -ne 0) {
        throw "Failed to create nodegroup 'linux-nodes'"
    }
    Write-Host "Nodegroup created." -ForegroundColor Green
}
89
+
90
# Verify nodes are ready
# Polls up to 30 times, 10s apart, until at least one node's STATUS is Ready.
# Listing a node is not enough: freshly joined nodes start out NotReady.
Write-Host "Waiting for nodes to be ready..."
$nodesReady = $false
for ($i = 0; $i -lt 30; $i++) {
    $nodes = $null
    try {
        $nodes = kubectl get nodes --no-headers 2>$null
    } catch {
        $nodes = $null
    }

    # '\sReady\s' matches "Ready" in the STATUS column but not "NotReady".
    if ($nodes -and @($nodes | Select-String -Pattern '\sReady\s').Count -gt 0) {
        Write-Host "Nodes ready:" -ForegroundColor Green
        kubectl get nodes
        $nodesReady = $true
        break
    }
    Start-Sleep -Seconds 10
}
if (-not $nodesReady) {
    Write-Host "WARNING: Nodes not ready yet. Check with: kubectl get nodes" -ForegroundColor Yellow
}

Write-Host "Enabling Prefix Delegation on VPC CNI..."
kubectl set env daemonset aws-node -n kube-system ENABLE_PREFIX_DELEGATION=true
if ($LASTEXITCODE -ne 0) {
    throw "Failed to enable Prefix Delegation on the aws-node daemonset"
}
Write-Host "Prefix Delegation enabled."
110
+
111
# --- Phase 2: Deploy Sample Workloads ---
# Applies k8s-workloads.yaml and lists the resulting pods in prod-sre.
Write-Host ""
Write-Host ">>> Phase 2: Deploying sample workloads (payments, checkout, catalog, cart, auth)..." -ForegroundColor Yellow
kubectl apply -f (Join-Path $AwsDir "k8s-workloads.yaml")
# kubectl signals failure only through its exit code; stop rather than
# report success for a manifest that was not applied.
if ($LASTEXITCODE -ne 0) {
    throw "Failed to apply k8s-workloads.yaml"
}
Write-Host "Workloads deployed." -ForegroundColor Green
kubectl get pods -n prod-sre
117
+
118
# --- Phase 3: Create AMP Workspace ---
# Reuses the workspace with alias "antiatropos-metrics" when one is listed,
# otherwise creates it. Sets $AmpWsId and $AmpUrl for later phases.
Write-Host ""
Write-Host ">>> Phase 3: Creating Amazon Managed Prometheus workspace..." -ForegroundColor Yellow

$AmpWsId = $null
try {
    $AmpWsId = aws amp list-workspaces --alias antiatropos-metrics --region $Region --query 'workspaces[0].workspaceId' --output text 2>$null
    # The CLI prints the literal text "None" when the query matches nothing.
    if ($AmpWsId -eq "None") { $AmpWsId = $null }
} catch {
    # Lookup is best-effort; fall through to creation.
    $AmpWsId = $null
}

if ([string]::IsNullOrWhiteSpace($AmpWsId)) {
    $AmpWsId = aws amp create-workspace `
        --alias antiatropos-metrics `
        --region $Region `
        --query 'workspaceId' `
        --output text
    # An empty ID would silently produce a malformed AMP URL below.
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($AmpWsId) -or $AmpWsId -eq "None") {
        throw "Failed to create AMP workspace 'antiatropos-metrics' in region '$Region'"
    }
    Write-Host "AMP workspace created: $AmpWsId" -ForegroundColor Green
} else {
    Write-Host "AMP workspace already exists: $AmpWsId"
}

$AmpWsId = "$AmpWsId".Trim()
$AmpUrl = "https://aps-workspaces.$Region.amazonaws.com/workspaces/$AmpWsId"
Write-Host "AMP URL: $AmpUrl"
141
+
142
# --- Phase 4: Set up IRSA for Prometheus Agent ---
# Creates the prometheus-sa service account (monitoring namespace) bound to the
# AmazonPrometheusRemoteWriteAccess policy unless it already exists.
Write-Host ""
Write-Host ">>> Phase 4: Setting up IRSA for Prometheus Agent..." -ForegroundColor Yellow

# kubectl exits non-zero when the service account (or namespace) is missing;
# it does not throw, so the exit code is what must be inspected.
$saExists = $false
try {
    kubectl get serviceaccount prometheus-sa -n monitoring 2>$null | Out-Null
    $saExists = ($LASTEXITCODE -eq 0)
} catch {
    $saExists = $false
}

if ($saExists) {
    Write-Host "prometheus-sa already exists."
} else {
    # --region is passed explicitly so eksctl targets the same region as the
    # rest of the script instead of the CLI's ambient default.
    eksctl create iamserviceaccount `
        --cluster $ClusterName `
        --region $Region `
        --namespace monitoring `
        --name prometheus-sa `
        --attach-policy-arn "arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess" `
        --approve `
        --override-existing-serviceaccounts
    if ($LASTEXITCODE -ne 0) {
        throw "Failed to create IAM service account 'prometheus-sa'"
    }
    Write-Host "prometheus-sa created." -ForegroundColor Green
}
164
+
165
# --- Phase 5: Install Prometheus Agent ---
# Installs or upgrades the prometheus-community/prometheus chart as release
# "prometheus-agent", remote-writing to the AMP workspace from Phase 3.
Write-Host ""
Write-Host ">>> Phase 5: Installing Prometheus Agent (remote-writes to AMP)..." -ForegroundColor Yellow

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>$null
helm repo update

$promValuesYaml = Join-Path $AwsDir "prometheus-agent-values.yaml"
$remoteWriteUrl = "$AmpUrl/api/v1/remote_write"

# `upgrade --install` covers both first install and upgrade, so no separate
# (exit-code-blind) `helm status` probe is needed.
# The remote-write override uses server.remoteWrite, the key deploy-all.ps1
# sets for this same chart. NOTE(review): prometheus.prometheusSpec.* belongs
# to kube-prometheus-stack - confirm against the chart's values.
helm upgrade --install prometheus-agent prometheus-community/prometheus `
    --namespace monitoring --create-namespace `
    -f $promValuesYaml `
    --set "server.remoteWrite[0].url=$remoteWriteUrl"
if ($LASTEXITCODE -ne 0) {
    throw "Failed to install/upgrade prometheus-agent"
}
Write-Host "prometheus-agent installed." -ForegroundColor Green
194
+
195
# --- Phase 6: Install Self-Hosted Grafana on EKS ---
# Publishes the dashboard JSON files as a secret, installs/upgrades the
# grafana/grafana chart in the monitoring namespace and waits for its rollout.
Write-Host ""
Write-Host ">>> Phase 6: Installing self-hosted Grafana on EKS..." -ForegroundColor Yellow

# Add Grafana Helm repo
helm repo add grafana https://grafana.github.io/helm-charts 2>$null
helm repo update

# Create a secret with the dashboard JSON files for Grafana to import.
# This script lives in deploy/aws, so deploy/grafana/... is one level up; the
# original two-levels-up location is kept as a fallback.
$DashboardsDir = Join-Path $AwsDir "..\grafana\provisioning\dashboards\json"
if (-not (Test-Path $DashboardsDir)) {
    $DashboardsDir = Join-Path $AwsDir "..\..\grafana\provisioning\dashboards\json"
}
if (Test-Path $DashboardsDir) {
    Write-Host "Creating dashboard secret from $DashboardsDir..."
    $overviewJson = Join-Path $DashboardsDir "antiatropos-overview.json"
    $liveJson = Join-Path $DashboardsDir "antiatropos-live.json"
    kubectl create secret generic antiatropos-grafana-dashboards `
        "--from-file=antiatropos-overview.json=$overviewJson" `
        "--from-file=antiatropos-live.json=$liveJson" `
        --namespace monitoring `
        --dry-run=client -o yaml | kubectl apply -f -
    if ($LASTEXITCODE -ne 0) {
        throw "Failed to create dashboard secret 'antiatropos-grafana-dashboards'"
    }
    Write-Host "Dashboard secret created." -ForegroundColor Green
} else {
    Write-Host "Dashboard JSON directory not found at $DashboardsDir, skipping."
}

# Install Grafana (`upgrade --install` handles both fresh install and upgrade)
$GrafanaValuesYaml = Join-Path $AwsDir "grafana-values.yaml"
helm upgrade --install grafana grafana/grafana --namespace monitoring -f $GrafanaValuesYaml
if ($LASTEXITCODE -ne 0) {
    throw "Failed to install/upgrade Grafana"
}
Write-Host "Grafana installed." -ForegroundColor Green

# Wait for Grafana pod to be ready
Write-Host "Waiting for Grafana pod to be ready..."
kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>$null | Out-Null
if ($LASTEXITCODE -ne 0) {
    # Non-fatal: the remaining phases do not depend on Grafana being up.
    Write-Host "WARNING: Grafana rollout not complete. Check with: kubectl -n monitoring get pods" -ForegroundColor Yellow
}

$GrafanaPod = kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>$null
Write-Host "Grafana pod: $GrafanaPod"
Write-Host "To access Grafana: kubectl port-forward svc/grafana 3000 -n monitoring" -ForegroundColor Yellow
Write-Host "Login: admin / antiatropos"
236
+
237
# --- Phase 7: Install Cluster Autoscaler ---
# Installs or upgrades the autoscaler/cluster-autoscaler chart in kube-system
# using the values file next to this script.
Write-Host ""
Write-Host ">>> Phase 7: Installing Cluster Autoscaler..." -ForegroundColor Yellow

helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>$null
helm repo update

$autoscalerValues = Join-Path $AwsDir "cluster-autoscaler-values.yaml"

# `upgrade --install` replaces the old status probe, which never looked at
# helm's exit code and so could pick the upgrade path for a missing release.
helm upgrade --install cluster-autoscaler autoscaler/cluster-autoscaler `
    --namespace kube-system `
    -f $autoscalerValues
if ($LASTEXITCODE -ne 0) {
    throw "Failed to install/upgrade cluster-autoscaler"
}
Write-Host "cluster-autoscaler installed." -ForegroundColor Green
263
+
264
# --- Phase 8: Generate Kubeconfig for HF Spaces ---
# Delegates to generate-kubeconfig.ps1 when it exists; otherwise writes the
# kubeconfig inline. The file authenticates through `aws eks get-token`.
Write-Host ""
Write-Host ">>> Phase 8: Generating kubeconfig for HF Spaces..." -ForegroundColor Yellow

$generateScript = Join-Path $AwsDir "generate-kubeconfig.ps1"
if (Test-Path $generateScript) {
    & $generateScript
} else {
    # Inline kubeconfig generation if the .ps1 version doesn't exist yet
    $output = Join-Path $AwsDir "kubeconfig-antiatropos.yaml"

    # Verify cluster exists (eksctl reports "not found" via its exit code)
    $clusterCheck = $false
    try {
        eksctl get cluster --name $ClusterName --region $Region 2>$null | Out-Null
        $clusterCheck = ($LASTEXITCODE -eq 0)
    } catch {
        $clusterCheck = $false
    }
    if (-not $clusterCheck) {
        Write-Host "ERROR: Cluster $ClusterName not found." -ForegroundColor Red
        exit 1
    }

    $ClusterEndpoint = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.endpoint' --output text
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($ClusterEndpoint) -or $ClusterEndpoint -eq "None") {
        throw "Failed to read API endpoint for cluster '$ClusterName'"
    }
    $ClusterCa = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.certificateAuthority.data' --output text
    if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($ClusterCa) -or $ClusterCa -eq "None") {
        throw "Failed to read certificate authority data for cluster '$ClusterName'"
    }
    $Timestamp = (Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")

    # Built line by line so the YAML nesting is explicit and cannot be lost to
    # editor or whitespace normalisation.
    $kubeconfigLines = @(
        "# Kubeconfig for AntiAtropos on Hugging Face Spaces",
        "# Generated: $Timestamp",
        "# Cluster: $ClusterName",
        "# Region: $Region",
        "#",
        "# This kubeconfig uses AWS IAM authenticator.",
        "# The HF Space container must have aws-cli available,",
        "# OR the kubernetes Python client must be configured with AWS credentials.",
        "",
        "apiVersion: v1",
        "kind: Config",
        "clusters:",
        "  - cluster:",
        "      certificate-authority-data: $ClusterCa",
        "      server: $ClusterEndpoint",
        "    name: $ClusterName",
        "",
        "contexts:",
        "  - context:",
        "      cluster: $ClusterName",
        "      user: antiatropos-hf-user",
        "    name: $ClusterName",
        "",
        "current-context: $ClusterName",
        "",
        "preferences: {}",
        "",
        "users:",
        "  - name: antiatropos-hf-user",
        "    user:",
        "      exec:",
        "        apiVersion: client.authentication.k8s.io/v1beta1",
        "        command: aws",
        "        args:",
        "          - eks",
        "          - get-token",
        "          - --region",
        "          - $Region",
        "          - --cluster-name",
        "          - $ClusterName",
        "        env:",
        "          - name: AWS_STS_REGIONAL_ENDPOINTS",
        "            value: regional",
        "          - name: AWS_DEFAULT_REGION",
        "            value: $Region",
        "        interactiveMode: IfAvailable"
    )
    $kubeconfig = $kubeconfigLines -join "`n"

    $kubeconfig | Out-File -FilePath $output -Encoding utf8 -Force
    Write-Host "Kubeconfig written to: $output" -ForegroundColor Green
    Write-Host ""
    Write-Host "To encode for HF Spaces secret:" -ForegroundColor Yellow
    Write-Host " [Convert]::ToBase64String([System.IO.File]::ReadAllBytes('$output'))"
}
345
+
346
# --- Done ---
# Final summary: AMP details, Grafana access, kubeconfig location and the
# settings to configure on the HF Space.
Write-Host ""
foreach ($bannerLine in @(
        "==========================================",
        " AntiAtropos AWS Infrastructure Ready!",
        "=========================================="
    )) {
    Write-Host $bannerLine -ForegroundColor Cyan
}

$infoLines = @(
    "",
    "AMP Workspace ID: $AmpWsId",
    "AMP URL: $AmpUrl",
    "",
    "Grafana: Self-hosted on EKS (monitoring namespace)",
    " Access: kubectl port-forward svc/grafana 3000 -n monitoring",
    " Login: admin / antiatropos",
    " URL: http://localhost:3000",
    "",
    "Kubeconfig saved: $(Join-Path $AwsDir 'kubeconfig-antiatropos.yaml')",
    ""
)
foreach ($infoLine in $infoLines) {
    Write-Host $infoLine
}

Write-Host "Next steps - configure your HF Space:" -ForegroundColor Yellow
$nextSteps = @(
    " 1. Set secret KUBECONFIG_CONTENT = base64 of kubeconfig-antiatropos.yaml",
    " 2. Set env var PROMETHEUS_URL = $AmpUrl",
    " 3. Set env var KUBECONFIG = /app/kubeconfig.yaml",
    " 4. Set env var ANTIATROPOS_ENV_MODE = live",
    " 5. Set env var ANTIATROPOS_MAX_REPLICAS = 6",
    " 6. Set env var ANTIATROPOS_WORKLOAD_MAP = (see OPERATIONS.md)",
    " 7. Add kubeconfig decode to deploy/entrypoint.sh (see OPERATIONS.md)"
)
foreach ($step in $nextSteps) {
    Write-Host $step
}
deploy/aws/deploy.sh ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # AntiAtropos AWS Infrastructure Deploy Script
3
+ #
4
+ # Deploys: EKS cluster, sample workloads, AMP workspace, Prometheus Agent,
5
+ # self-hosted Grafana, Cluster Autoscaler, and generates kubeconfig for HF Spaces.
6
+ #
7
+ # The AntiAtropos FastAPI server runs on Hugging Face Spaces, NOT on AWS.
8
+ # This script only sets up the infrastructure that HF Spaces connects to.
9
+ #
10
+ # Prerequisites: aws cli, eksctl, kubectl, helm
11
+ #
12
+ # Usage:
13
+ # chmod +x deploy/aws/deploy.sh
14
+ # ./deploy/aws/deploy.sh
15
+ #
16
+ # Environment variables:
17
+ # AWS_REGION - AWS region (default: ap-south-1)
18
+ # CLUSTER_NAME - EKS cluster name (default: antiatropos)
19
+
20
+ set -euo pipefail
21
+
22
+ REGION="${AWS_REGION:-ap-south-1}"
23
+ CLUSTER_NAME="${CLUSTER_NAME:-antiatropos}"
24
+ AWS_DIR="$(cd "$(dirname "$0")" && pwd)"
25
+
26
+ echo "=== AntiAtropos AWS Infrastructure Deployment ==="
27
+ echo "Region: $REGION"
28
+ echo "Cluster: $CLUSTER_NAME"
29
+ echo "FastAPI: Runs on HF Spaces (not deployed here)"
30
+ echo ""
31
+
32
+ # --- Check prerequisites ---
33
+ for cmd in aws eksctl kubectl helm; do
34
+ if ! command -v "$cmd" &>/dev/null; then
35
+ echo "ERROR: $cmd is not installed. Please install it first."
36
+ exit 1
37
+ fi
38
+ done
39
+
40
+ # --- Phase 1: Create EKS Cluster ---
41
+ echo ""
42
+ echo ">>> Phase 1: Creating EKS cluster..."
43
+ if eksctl get cluster --name "$CLUSTER_NAME" --region "$REGION" &>/dev/null; then
44
+ echo "Cluster $CLUSTER_NAME already exists, skipping creation."
45
+ else
46
+ eksctl create cluster -f "$AWS_DIR/eksctl-cluster.yaml"
47
+ echo "Cluster created."
48
+ fi
49
+
50
+ aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$REGION"
51
+ echo "kubeconfig updated."
52
+
53
+ # --- Phase 2: Deploy Sample Workloads ---
54
+ echo ""
55
+ echo ">>> Phase 2: Deploying sample workloads (payments, checkout, catalog, cart, auth)..."
56
+ kubectl apply -f "$AWS_DIR/k8s-workloads.yaml"
57
+ echo "Workloads deployed."
58
+ kubectl get pods -n prod-sre
59
+
60
+ # --- Phase 3: Create AMP Workspace ---
61
+ echo ""
62
+ echo ">>> Phase 3: Creating Amazon Managed Prometheus workspace..."
63
+ AMP_WS_ID=$(aws amp list-workspaces --alias antiatropos-metrics --region "$REGION" --query 'workspaces[0].workspaceId' --output text 2>/dev/null || echo "")
64
+
65
+ if [ -z "$AMP_WS_ID" ] || [ "$AMP_WS_ID" = "None" ]; then
66
+ AMP_WS_ID=$(aws amp create-workspace \
67
+ --alias antiatropos-metrics \
68
+ --region "$REGION" \
69
+ --query 'workspaceId' \
70
+ --output text)
71
+ echo "AMP workspace created: $AMP_WS_ID"
72
+ else
73
+ echo "AMP workspace already exists: $AMP_WS_ID"
74
+ fi
75
+
76
+ AMP_URL="https://aps-workspaces.$REGION.amazonaws.com/workspaces/$AMP_WS_ID"
77
+ echo "AMP URL: $AMP_URL"
78
+
79
+ # --- Phase 4: Set up IRSA for Prometheus Agent (pass --region explicitly, like every other eksctl/aws call) ---
80
+ echo ""
81
+ echo ">>> Phase 4: Setting up IRSA for Prometheus Agent..."
82
+ if kubectl get serviceaccount prometheus-sa -n monitoring &>/dev/null; then
83
+   echo "prometheus-sa already exists."
84
+ else
85
+   eksctl create iamserviceaccount \
86
+     --cluster "$CLUSTER_NAME" --region "$REGION" \
87
+     --namespace monitoring \
88
+     --name prometheus-sa \
89
+     --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
90
+     --approve \
91
+     --override-existing-serviceaccounts
92
+   echo "prometheus-sa created."
93
+ fi
94
+
95
+ # --- Phase 5: Install Prometheus Agent ---
96
+ echo ""
97
+ echo ">>> Phase 5: Installing Prometheus Agent (remote-writes to AMP)..."
98
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts 2>/dev/null || true
99
+ helm repo update
100
+
101
+ if helm status prometheus-agent -n monitoring &>/dev/null; then
102
+ echo "prometheus-agent already installed, upgrading..."
103
+ helm upgrade prometheus-agent prometheus-community/prometheus \
104
+ --namespace monitoring \
105
+ -f "$AWS_DIR/prometheus-agent-values.yaml" \
106
+ --set "prometheus.prometheusSpec.remoteWrite[0].url=$AMP_URL/api/v1/remote_write"
107
+ else
108
+ helm install prometheus-agent prometheus-community/prometheus \
109
+ --namespace monitoring --create-namespace \
110
+ -f "$AWS_DIR/prometheus-agent-values.yaml" \
111
+ --set "prometheus.prometheusSpec.remoteWrite[0].url=$AMP_URL/api/v1/remote_write"
112
+ echo "prometheus-agent installed."
113
+ fi
114
+
115
+ # --- Phase 6: Install Self-Hosted Grafana on EKS ---
116
+ echo ""
117
+ echo ">>> Phase 6: Installing self-hosted Grafana on EKS..."
118
+
119
+ # Add Grafana Helm repo
120
+ helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
121
+ helm repo update
122
+
123
+ # Create a secret with the dashboard JSON files for Grafana to import (they live in deploy/grafana, a sibling of deploy/aws)
124
+ DASHBOARDS_DIR="$AWS_DIR/../grafana/provisioning/dashboards/json"
125
+ if [ -d "$DASHBOARDS_DIR" ]; then
126
+   echo "Creating dashboard secret from $DASHBOARDS_DIR..."
127
+   kubectl create secret generic antiatropos-grafana-dashboards \
128
+     --from-file=antiatropos-overview.json="$DASHBOARDS_DIR/antiatropos-overview.json" \
129
+     --from-file=antiatropos-live.json="$DASHBOARDS_DIR/antiatropos-live.json" \
130
+     --namespace monitoring \
131
+     --dry-run=client -o yaml | kubectl apply -f -
132
+   echo "Dashboard secret created."
133
+ else
134
+   echo "Dashboard JSON directory not found at $DASHBOARDS_DIR, skipping."
135
+ fi
136
+
137
+ # Install Grafana
138
+ GRAFANA_VALUES="$AWS_DIR/grafana-values.yaml"
139
+
140
+ if helm status grafana -n monitoring &>/dev/null; then
141
+ echo "Grafana already installed, upgrading..."
142
+ helm upgrade grafana grafana/grafana --namespace monitoring -f "$GRAFANA_VALUES"
143
+ else
144
+ helm install grafana grafana/grafana --namespace monitoring -f "$GRAFANA_VALUES"
145
+ echo "Grafana installed."
146
+ fi
147
+
148
+ # Wait for Grafana pod to be ready (best-effort: a slow rollout must not abort the deployment)
149
+ echo "Waiting for Grafana pod to be ready..."
150
+ kubectl rollout status deployment/grafana --namespace monitoring --timeout=120s 2>/dev/null || true
151
+ # Pod lookup is informational only; '|| true' keeps 'set -e' from killing the script when no pod exists yet.
152
+ GRAFANA_POD=$(kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
153
+ echo "Grafana pod: $GRAFANA_POD"
154
+ echo "To access Grafana: kubectl port-forward svc/grafana 3000:80 -n monitoring"
155
+ echo "Login: admin / antiatropos"
156
+
157
+ # --- Phase 7: Install Cluster Autoscaler ---
158
+ echo ""
159
+ echo ">>> Phase 7: Installing Cluster Autoscaler..."
160
+ helm repo add autoscaler https://kubernetes.github.io/autoscaler 2>/dev/null || true
161
+ helm repo update
162
+
163
+ if helm status cluster-autoscaler -n kube-system &>/dev/null; then
164
+ echo "cluster-autoscaler already installed, upgrading..."
165
+ helm upgrade cluster-autoscaler autoscaler/cluster-autoscaler \
166
+ --namespace kube-system \
167
+ -f "$AWS_DIR/cluster-autoscaler-values.yaml"
168
+ else
169
+ helm install cluster-autoscaler autoscaler/cluster-autoscaler \
170
+ --namespace kube-system \
171
+ -f "$AWS_DIR/cluster-autoscaler-values.yaml"
172
+ echo "cluster-autoscaler installed."
173
+ fi
174
+
175
+ # --- Phase 8: Generate Kubeconfig for HF Spaces ---
176
+ echo ""
177
+ echo ">>> Phase 8: Generating kubeconfig for HF Spaces..."
178
+ "$AWS_DIR/generate-kubeconfig.sh"
179
+
180
+ # --- Done ---
181
+ echo ""
182
+ echo "=========================================="
183
+ echo " AntiAtropos AWS Infrastructure Ready!"
184
+ echo "=========================================="
185
+ echo ""
186
+ echo "AMP Workspace ID: $AMP_WS_ID"
187
+ echo "AMP URL: $AMP_URL"
188
+ echo ""
189
+ echo "Grafana: Self-hosted on EKS (monitoring namespace)"
189
+ echo " Access: kubectl port-forward svc/grafana 3000:80 -n monitoring"
190
+ echo " Login: admin / antiatropos"
191
+ echo " URL: http://localhost:3000"
193
+ echo ""
194
+ echo "Kubeconfig saved: $AWS_DIR/kubeconfig-antiatropos.yaml"
195
+ echo ""
196
+ echo "Next steps — configure your HF Space:"
197
+ echo " 1. Set secret KUBECONFIG_CONTENT = base64 of kubeconfig-antiatropos.yaml"
198
+ echo " 2. Set env var PROMETHEUS_URL = $AMP_URL"
199
+ echo " 3. Set env var KUBECONFIG = /app/kubeconfig.yaml"
200
+ echo " 4. Set env var ANTIATROPOS_ENV_MODE = live"
201
+ echo " 5. Set env var ANTIATROPOS_MAX_REPLICAS = 6"
202
+ echo " 6. Set env var ANTIATROPOS_WORKLOAD_MAP = (see OPERATIONS.md)"
203
+ echo " 7. Add kubeconfig decode to deploy/entrypoint.sh (see OPERATIONS.md)"
204
+
deploy/aws/eksctl-cluster.yaml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: eksctl.io/v1alpha5
2
+ kind: ClusterConfig
3
+
4
+ metadata:
5
+ name: antiatropos
6
+ region: ap-south-1
7
+ version: "1.30"
8
+ tags:
9
+ Project: AntiAtropos
10
+ Environment: production
11
+
12
+ autoModeConfig:
13
+ enabled: false
14
+
15
+ iam:
16
+ withOIDC: true
17
+
18
+ addons:
19
+ - name: vpc-cni
20
+ version: latest
21
+ - name: coredns
22
+ version: latest
23
+ - name: kube-proxy
24
+ version: latest
25
+ - name: aws-ebs-csi-driver
26
+ version: latest
27
+ wellKnownPolicies:
28
+ ebsCSIController: true
29
+
30
+ managedNodeGroups:
31
+ - name: linux-nodes
32
+ instanceType: t3.micro
33
+ maxPodsPerNode: 110
34
+ desiredCapacity: 2
35
+ minSize: 1
36
+ maxSize: 4
37
+ volumeSize: 50
38
+ volumeType: gp3
39
+ availabilityZones:
40
+ - ap-south-1a
41
+ - ap-south-1b
42
+ labels:
43
+ role: worker
44
+ tags:
45
+ Project: AntiAtropos
46
+ NodeGroup: linux-nodes
47
+ iam:
48
+ withAddonPolicies:
49
+ ebs: true
50
+ cloudWatch: true
51
+ autoScaler: true
52
+
53
+ cloudWatch:
54
+ clusterLogging:
55
+ enableTypes:
56
+ - api
57
+ - audit
58
+ - authenticator
deploy/aws/generate-kubeconfig.ps1 ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generate a kubeconfig for HF Spaces to connect to the EKS cluster.
2
+ #
3
+ # This creates a kubeconfig that uses AWS IAM authenticator,
4
+ # which works from outside the cluster (like from HF Spaces).
5
+ #
6
+ # Prerequisites: aws cli, kubectl, eksctl
7
+ #
8
+ # Usage:
9
+ # .\deploy\aws\generate-kubeconfig.ps1
10
+ #
11
+ # Output:
12
+ # deploy/aws/kubeconfig-antiatropos.yaml
13
+ #
14
+ # Then on HF Spaces:
15
+ # 1. base64 encode: $b64 = [Convert]::ToBase64String([IO.File]::ReadAllBytes('deploy\aws\kubeconfig-antiatropos.yaml'))
16
+ # 2. Set as HF Space secret: KUBECONFIG_CONTENT = <base64 output>
17
+ # 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
18
+ # 4. Add to deploy/entrypoint.sh:
19
+ # if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
20
+ # echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
21
+ # export KUBECONFIG=/app/kubeconfig.yaml
22
+ # fi
23
+
24
+ $ErrorActionPreference = "Stop"
25
+
26
+ $Region = if ($env:AWS_REGION) { $env:AWS_REGION } else { "ap-south-1" }
27
+ $ClusterName = if ($env:CLUSTER_NAME) { $env:CLUSTER_NAME } else { "antiatropos" }
28
+ $AwsDir = Split-Path -Parent $MyInvocation.MyCommand.Path
29
+ $Output = Join-Path $AwsDir "kubeconfig-antiatropos.yaml"
30
+
31
+ Write-Host ""
32
+ Write-Host "=== Generating kubeconfig for HF Spaces ===" -ForegroundColor Cyan
33
+ Write-Host "Cluster: $ClusterName"
34
+ Write-Host "Region: $Region"
35
+ Write-Host ""
36
+
37
+ # Verify cluster exists (native commands do not throw on failure, so test the exit code explicitly)
38
+ $clusterExists = $false
39
+ try {
40
+     eksctl get cluster --name $ClusterName --region $Region 2>$null | Out-Null
41
+     $clusterExists = ($LASTEXITCODE -eq 0)
42
+ } catch {}
43
+
44
+ if (-not $clusterExists) {
45
+     Write-Host "ERROR: Cluster $ClusterName not found. Create it first with eksctl." -ForegroundColor Red
46
+     exit 1
47
+ }
48
+
49
+ # Get cluster details
50
+ $ClusterEndpoint = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.endpoint' --output text
51
+ $ClusterCa = aws eks describe-cluster --name $ClusterName --region $Region --query 'cluster.certificateAuthority.data' --output text
52
+ $AwsArn = aws sts get-caller-identity --query Arn --output text
53
+ $Timestamp = (Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ")
54
+
55
+ Write-Host "Cluster endpoint: $ClusterEndpoint"
56
+ Write-Host "AWS identity: $AwsArn"
57
+ Write-Host ""
58
+
59
+ # Generate the kubeconfig
60
+ $kubeconfig = @"
61
+ # Kubeconfig for AntiAtropos on Hugging Face Spaces
62
+ # Generated: $Timestamp
63
+ # Cluster: $ClusterName
64
+ # Region: $Region
65
+ #
66
+ # This kubeconfig uses AWS IAM authenticator.
67
+ # The HF Space container must have aws-cli and aws-iam-authenticator available,
68
+ # OR the kubernetes Python client must be configured with AWS credentials.
69
+ #
70
+ # To use this on HF Spaces:
71
+ # 1. base64 encode this file
72
+ # 2. Set as HF secret: KUBECONFIG_CONTENT = <base64>
73
+ # 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
74
+ # 4. Decode in entrypoint.sh before uvicorn starts
75
+
76
+ apiVersion: v1
77
+ kind: Config
78
+ clusters:
79
+ - cluster:
80
+ certificate-authority-data: $ClusterCa
81
+ server: $ClusterEndpoint
82
+ name: $ClusterName
83
+
84
+ contexts:
85
+ - context:
86
+ cluster: $ClusterName
87
+ user: antiatropos-hf-user
88
+ name: $ClusterName
89
+
90
+ current-context: $ClusterName
91
+
92
+ preferences: {}
93
+
94
+ users:
95
+ - name: antiatropos-hf-user
96
+ user:
97
+ exec:
98
+ apiVersion: client.authentication.k8s.io/v1beta1
99
+ command: aws
100
+ args:
101
+ - eks
102
+ - get-token
103
+ - --region
104
+ - $Region
105
+ - --cluster-name
106
+ - $ClusterName
107
+ env:
108
+ - name: AWS_STS_REGIONAL_ENDPOINTS
109
+ value: regional
110
+ - name: AWS_DEFAULT_REGION
111
+ value: $Region
112
+ interactiveMode: IfAvailable
113
+ "@
114
+
115
+ $kubeconfig | Out-File -FilePath $Output -Encoding utf8 -Force
116
+
117
+ Write-Host "Kubeconfig written to: $Output" -ForegroundColor Green
118
+ Write-Host ""
119
+ Write-Host "IMPORTANT: The HF Space container needs the AWS CLI and credentials" -ForegroundColor Yellow
120
+ Write-Host "to authenticate with EKS. You have two options:"
121
+ Write-Host ""
122
+ Write-Host "Option A: Include aws-cli in your Docker image and set AWS_ACCESS_KEY_ID /"
123
+ Write-Host " AWS_SECRET_ACCESS_KEY as HF Space secrets."
124
+ Write-Host ""
125
+ Write-Host "Option B: Use the kubernetes Python client with AWS SDK (boto3)."
126
+ Write-Host " The kubernetes_executor.py already supports this via"
127
+ Write-Host " load_kube_config() which uses the Python client's auth plugins."
128
+ Write-Host ""
129
+ Write-Host "To encode for HF Spaces secret:" -ForegroundColor Yellow
130
+ Write-Host " [Convert]::ToBase64String([IO.File]::ReadAllBytes('$Output'))"
131
+
deploy/aws/generate-kubeconfig.sh ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Generate a kubeconfig for HF Spaces to connect to the EKS cluster.
3
+ #
4
+ # This creates a kubeconfig that uses AWS IAM authenticator,
5
+ # which works from outside the cluster (like from HF Spaces).
6
+ #
7
+ # Prerequisites:
8
+ # - aws cli
9
+ # - kubectl
10
+ # - eksctl
11
+ # - The EKS cluster must already exist
12
+ #
13
+ # Usage:
14
+ # ./generate-kubeconfig.sh
15
+ #
16
+ # Output:
17
+ # deploy/aws/kubeconfig-antiatropos.yaml
18
+ #
19
+ # Then on HF Spaces:
20
+ # 1. base64 encode: cat kubeconfig-antiatropos.yaml | base64 -w 0
21
+ # 2. Set as HF Space secret: KUBECONFIG_CONTENT = <base64 output>
22
+ # 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
23
+ # 4. Add to deploy/entrypoint.sh:
24
+ # if [ -n "${KUBECONFIG_CONTENT:-}" ]; then
25
+ # echo "${KUBECONFIG_CONTENT}" | base64 -d > /app/kubeconfig.yaml
26
+ # export KUBECONFIG=/app/kubeconfig.yaml
27
+ # fi
28
+
29
+ set -euo pipefail
30
+
31
+ REGION="${AWS_REGION:-ap-south-1}"
32
+ CLUSTER_NAME="${CLUSTER_NAME:-antiatropos}"
33
+ AWS_DIR="$(cd "$(dirname "$0")" && pwd)"
34
+ OUTPUT="$AWS_DIR/kubeconfig-antiatropos.yaml"
35
+
36
+ echo "=== Generating kubeconfig for HF Spaces ==="
37
+ echo "Cluster: $CLUSTER_NAME"
38
+ echo "Region: $REGION"
39
+ echo ""
40
+
41
+ # Verify cluster exists
42
+ if ! eksctl get cluster --name "$CLUSTER_NAME" --region "$REGION" &>/dev/null; then
43
+ echo "ERROR: Cluster $CLUSTER_NAME not found. Create it first with eksctl."
44
+ exit 1
45
+ fi
46
+
47
+ # Get cluster details
48
+ CLUSTER_ENDPOINT=$(aws eks describe-cluster \
49
+ --name "$CLUSTER_NAME" \
50
+ --region "$REGION" \
51
+ --query 'cluster.endpoint' \
52
+ --output text)
53
+
54
+ CLUSTER_CA=$(aws eks describe-cluster \
55
+ --name "$CLUSTER_NAME" \
56
+ --region "$REGION" \
57
+ --query 'cluster.certificateAuthority.data' \
58
+ --output text)
59
+
60
+ # Get the current AWS identity for the kubeconfig
61
+ AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
62
+ AWS_ARN=$(aws sts get-caller-identity --query Arn --output text)
63
+
64
+ echo "Cluster endpoint: $CLUSTER_ENDPOINT"
65
+ echo "AWS identity: $AWS_ARN"
66
+ echo ""
67
+
68
+ # Generate the kubeconfig
69
+ cat > "$OUTPUT" <<EOF
70
+ # Kubeconfig for AntiAtropos on Hugging Face Spaces
71
+ # Generated: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
72
+ # Cluster: $CLUSTER_NAME
73
+ # Region: $REGION
74
+ #
75
+ # This kubeconfig uses AWS IAM authenticator.
76
+ # The HF Space container must have aws-cli and aws-iam-authenticator available,
77
+ # OR the kubernetes Python client must be configured with AWS credentials.
78
+ #
79
+ # To use this on HF Spaces:
80
+ # 1. base64 encode this file: cat kubeconfig-antiatropos.yaml | base64 -w 0
81
+ # 2. Set as HF secret: KUBECONFIG_CONTENT = <base64>
82
+ # 3. Set env var: KUBECONFIG = /app/kubeconfig.yaml
83
+ # 4. Decode in entrypoint.sh before uvicorn starts
84
+
85
+ apiVersion: v1
86
+ kind: Config
87
+ clusters:
88
+   - cluster:
89
+       certificate-authority-data: $CLUSTER_CA
90
+       server: $CLUSTER_ENDPOINT
91
+     name: $CLUSTER_NAME
92
+
93
+ contexts:
94
+   - context:
95
+       cluster: $CLUSTER_NAME
96
+       user: antiatropos-hf-user
97
+     name: $CLUSTER_NAME
98
+
99
+ current-context: $CLUSTER_NAME
100
+
101
+ preferences: {}
102
+
103
+ users:
104
+   - name: antiatropos-hf-user
105
+     user:
106
+       exec:
107
+         apiVersion: client.authentication.k8s.io/v1beta1
108
+         command: aws
109
+         args:
110
+           - eks
111
+           - get-token
112
+           - --region
113
+           - $REGION
114
+           - --cluster-name
115
+           - $CLUSTER_NAME
116
+         env:
117
+           - name: AWS_STS_REGIONAL_ENDPOINTS
118
+             value: regional
119
+           - name: AWS_DEFAULT_REGION
120
+             value: $REGION
121
+         interactiveMode: IfAvailable
122
+ EOF
123
+
124
+ echo "Kubeconfig written to: $OUTPUT"
125
+ echo ""
126
+ echo "IMPORTANT: The HF Space container needs the AWS CLI and credentials"
127
+ echo "to authenticate with EKS. You have two options:"
128
+ echo ""
129
+ echo "Option A: Include aws-cli in your Docker image and set AWS_ACCESS_KEY_ID /"
130
+ echo " AWS_SECRET_ACCESS_KEY as HF Space secrets."
131
+ echo ""
132
+ echo "Option B: Use the kubernetes Python client with AWS SDK (boto3)."
133
+ echo " The kubernetes_executor.py already supports this via"
134
+ echo " load_kube_config() which uses the Python client's auth plugins."
135
+ echo ""
136
+ echo "To encode for HF Spaces secret:"
137
+ echo " cat $OUTPUT | base64 -w 0"
138
+
deploy/aws/grafana-trust-policy.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Version": "2012-10-17",
3
+ "Statement": [
4
+ {
5
+ "Effect": "Allow",
6
+ "Principal": {
7
+ "Service": "grafana.amazonaws.com"
8
+ },
9
+ "Action": "sts:AssumeRole"
10
+ }
11
+ ]
12
+ }
deploy/aws/grafana-values.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grafana self-hosted on EKS
2
+ # Connects to the local Prometheus agent and imports AntiAtropos dashboards
3
+
4
+ replicaCount: 1
5
+
6
+ adminUser: admin
7
+ adminPassword: antiatropos
8
+
9
+ service:
10
+ type: ClusterIP
11
+ port: 80
12
+
13
+ persistence:
14
+ enabled: true
15
+ size: 5Gi
16
+ storageClassName: gp2
17
+
18
+ # Use the local Prometheus agent as data source
19
+ additionalDataSources:
20
+ - name: AMP-Local
21
+ type: prometheus
22
+ access: proxy
23
+ url: http://prometheus-agent-server.monitoring.svc.cluster.local:80
24
+ isDefault: true
25
+ editable: true
26
+
27
+ # Import AntiAtropos dashboards
28
+ dashboardProviders:
29
+ dashboardproviders.yaml:
30
+ apiVersion: 1
31
+ providers:
32
+ - name: 'default'
33
+ orgId: 1
34
+ folder: 'AntiAtropos'
35
+ type: file
36
+ disableDeletion: false
37
+ editable: true
38
+ options:
39
+ path: /var/lib/grafana/dashboards
40
+
41
+ dashboards:
42
+ default:
43
+ antiatropos-overview:
44
+ gnetId: null
45
+ datasource: AMP-Local
46
+ antiatropos-live:
47
+ gnetId: null
48
+ datasource: AMP-Local
49
+
50
+ # Allow dashboard JSON files to be mounted
51
+ extraSecretMounts:
52
+ - name: dashboards
53
+ mountPath: /var/lib/grafana/dashboards
54
+ subPath: ""
55
+ secretName: antiatropos-grafana-dashboards
56
+ readOnly: true
57
+
58
+ resources:
59
+ limits:
60
+ memory: 512Mi
61
+ cpu: 250m
62
+ requests:
63
+ memory: 256Mi
64
+ cpu: 100m
65
+
66
+ nodeSelector: {}
67
+ tolerations: []
68
+ affinity: {}
deploy/aws/k8s-workloads.yaml ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sample microservice deployments for AntiAtropos SRE training.
2
+ #
3
+ # These are the workloads the SRE agent will SCALE_UP / SCALE_DOWN / REROUTE_TRAFFIC / SHED_LOAD.
4
+ # Each maps to a simulator node via ANTIATROPOS_WORKLOAD_MAP on HF Spaces.
5
+ #
6
+ # Apply: kubectl apply -f k8s-workloads.yaml
7
+ #
8
+ # The Prometheus Agent (in monitoring namespace) scrapes these pods
9
+ # because they have the prometheus.io/scrape annotation.
10
+ # Metrics are remote-written to AMP where the AntiAtropos server on HF Spaces queries them.
11
+
12
+ ---
13
+ apiVersion: v1
14
+ kind: Namespace
15
+ metadata:
16
+ name: prod-sre
17
+ labels:
18
+ app.kubernetes.io/part-of: antiatropos
19
+
20
+ ---
21
+ # ResourceQuota: Hard cap on pods in prod-sre namespace.
22
+ # This is a Kubernetes-level safety net. Even if the agent's Python cap fails,
23
+ # Kubernetes will refuse to create pods beyond this limit.
24
+ #
25
+ # Max 30 pods = 6 replicas x 5 deployments (our worst-case budget)
26
+ # Max 8 CPU / 8GB RAM = enough for 30 small nginx pods
27
+ apiVersion: v1
28
+ kind: ResourceQuota
29
+ metadata:
30
+ name: prod-sre-quota
31
+ namespace: prod-sre
32
+ spec:
33
+ hard:
34
+ pods: "30"
35
+ requests.cpu: "8"
36
+ requests.memory: 8Gi
37
+ limits.cpu: "15"
38
+ limits.memory: 15Gi
39
+
40
+ ---
41
+ # payments — node-0 (VIP)
42
+ # Business-critical payment service. Always has 2 replicas for redundancy.
43
+ # The SRE agent should never SHED_LOAD on this (CRITICAL_NODES in simulator.py).
44
+ apiVersion: apps/v1
45
+ kind: Deployment
46
+ metadata:
47
+ name: payments
48
+ namespace: prod-sre
49
+ labels:
50
+ app: payments
51
+ node-id: node-0
52
+ critical: "true"
53
+ spec:
54
+ replicas: 2
55
+ selector:
56
+ matchLabels:
57
+ app: payments
58
+ template:
59
+ metadata:
60
+ labels:
61
+ app: payments
62
+ node-id: node-0
63
+ annotations:
64
+ prometheus.io/scrape: "true"
65
+ prometheus.io/port: "8080"
66
+ prometheus.io/path: "/metrics"
67
+ spec:
68
+ containers:
69
+ - name: payments
70
+ image: nginx:alpine
71
+ ports:
72
+ - containerPort: 80
73
+ resources:
74
+ requests:
75
+ cpu: 100m
76
+ memory: 64Mi
77
+ limits:
78
+ cpu: 250m
79
+ memory: 128Mi
80
+
81
+ ---
82
+ apiVersion: v1
83
+ kind: Service
84
+ metadata:
85
+ name: payments
86
+ namespace: prod-sre
87
+ spec:
88
+ selector:
89
+ app: payments
90
+ ports:
91
+ - port: 80
92
+ targetPort: 80
93
+
94
+ ---
95
+ # checkout — node-1
96
+ apiVersion: apps/v1
97
+ kind: Deployment
98
+ metadata:
99
+ name: checkout
100
+ namespace: prod-sre
101
+ labels:
102
+ app: checkout
103
+ node-id: node-1
104
+ spec:
105
+ replicas: 1
106
+ selector:
107
+ matchLabels:
108
+ app: checkout
109
+ template:
110
+ metadata:
111
+ labels:
112
+ app: checkout
113
+ node-id: node-1
114
+ annotations:
115
+ prometheus.io/scrape: "true"
116
+ prometheus.io/port: "8080"
117
+ prometheus.io/path: "/metrics"
118
+ spec:
119
+ containers:
120
+ - name: checkout
121
+ image: nginx:alpine
122
+ ports:
123
+ - containerPort: 80
124
+ resources:
125
+ requests:
126
+ cpu: 100m
127
+ memory: 64Mi
128
+ limits:
129
+ cpu: 250m
130
+ memory: 128Mi
131
+
132
+ ---
133
+ apiVersion: v1
134
+ kind: Service
135
+ metadata:
136
+ name: checkout
137
+ namespace: prod-sre
138
+ spec:
139
+ selector:
140
+ app: checkout
141
+ ports:
142
+ - port: 80
143
+ targetPort: 80
144
+
145
+ ---
146
+ # catalog — node-2
147
+ apiVersion: apps/v1
148
+ kind: Deployment
149
+ metadata:
150
+ name: catalog
151
+ namespace: prod-sre
152
+ labels:
153
+ app: catalog
154
+ node-id: node-2
155
+ spec:
156
+ replicas: 1
157
+ selector:
158
+ matchLabels:
159
+ app: catalog
160
+ template:
161
+ metadata:
162
+ labels:
163
+ app: catalog
164
+ node-id: node-2
165
+ annotations:
166
+ prometheus.io/scrape: "true"
167
+ prometheus.io/port: "8080"
168
+ prometheus.io/path: "/metrics"
169
+ spec:
170
+ containers:
171
+ - name: catalog
172
+ image: nginx:alpine
173
+ ports:
174
+ - containerPort: 80
175
+ resources:
176
+ requests:
177
+ cpu: 100m
178
+ memory: 64Mi
179
+ limits:
180
+ cpu: 250m
181
+ memory: 128Mi
182
+
183
+ ---
184
+ apiVersion: v1
185
+ kind: Service
186
+ metadata:
187
+ name: catalog
188
+ namespace: prod-sre
189
+ spec:
190
+ selector:
191
+ app: catalog
192
+ ports:
193
+ - port: 80
194
+ targetPort: 80
195
+
196
+ ---
197
+ # cart — node-3
198
+ apiVersion: apps/v1
199
+ kind: Deployment
200
+ metadata:
201
+ name: cart
202
+ namespace: prod-sre
203
+ labels:
204
+ app: cart
205
+ node-id: node-3
206
+ spec:
207
+ replicas: 1
208
+ selector:
209
+ matchLabels:
210
+ app: cart
211
+ template:
212
+ metadata:
213
+ labels:
214
+ app: cart
215
+ node-id: node-3
216
+ annotations:
217
+ prometheus.io/scrape: "true"
218
+ prometheus.io/port: "8080"
219
+ prometheus.io/path: "/metrics"
220
+ spec:
221
+ containers:
222
+ - name: cart
223
+ image: nginx:alpine
224
+ ports:
225
+ - containerPort: 80
226
+ resources:
227
+ requests:
228
+ cpu: 100m
229
+ memory: 64Mi
230
+ limits:
231
+ cpu: 250m
232
+ memory: 128Mi
233
+
234
+ ---
235
+ apiVersion: v1
236
+ kind: Service
237
+ metadata:
238
+ name: cart
239
+ namespace: prod-sre
240
+ spec:
241
+ selector:
242
+ app: cart
243
+ ports:
244
+ - port: 80
245
+ targetPort: 80
246
+
247
+ ---
248
+ # auth — node-4
249
+ apiVersion: apps/v1
250
+ kind: Deployment
251
+ metadata:
252
+ name: auth
253
+ namespace: prod-sre
254
+ labels:
255
+ app: auth
256
+ node-id: node-4
257
+ spec:
258
+ replicas: 1
259
+ selector:
260
+ matchLabels:
261
+ app: auth
262
+ template:
263
+ metadata:
264
+ labels:
265
+ app: auth
266
+ node-id: node-4
267
+ annotations:
268
+ prometheus.io/scrape: "true"
269
+ prometheus.io/port: "8080"
270
+ prometheus.io/path: "/metrics"
271
+ spec:
272
+ containers:
273
+ - name: auth
274
+ image: nginx:alpine
275
+ ports:
276
+ - containerPort: 80
277
+ resources:
278
+ requests:
279
+ cpu: 100m
280
+ memory: 64Mi
281
+ limits:
282
+ cpu: 250m
283
+ memory: 128Mi
284
+
285
+ ---
286
+ apiVersion: v1
287
+ kind: Service
288
+ metadata:
289
+ name: auth
290
+ namespace: prod-sre
291
+ spec:
292
+ selector:
293
+ app: auth
294
+ ports:
295
+ - port: 80
296
+ targetPort: 80
deploy/aws/kubeconfig-antiatropos.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: v1
2
+ kind: Config
3
+ clusters:
4
+ - cluster:
5
+ certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURCVENDQWUyZ0F3SUJBZ0lJQk5ZY1JYcVZ2dm93RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TmpBME1qTXhPREEwTURGYUZ3MHpOakEwTWpBeE9EQTVNREZhTUJVeApFekFSQmdOVkJBTVRDbXQxWW1WeWJtVjBaWE13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLCkFvSUJBUUN2cHYwRVRIREIxeVRjVVFxa21Xd2Z2YnE0Z3d1bm9HK0w0MkIvaUV0N3h1NVhTMjZQWVlwNURGckYKUTJoUTRndDlENDUwNXlHNkN0eCtWVXBncExpeUxEU3pMdEM2VHUrUm5uSEY0NHRHZ1NJQm9GaG9TaXhzWFV3SQoxU3E1NVBIeHhPQmo3OGJxRFVxL2R3eE1xOVk1TzBINmkwV1ZaZHMvTmhaMk9rd1dJeUJnYy9Rckhpb2ZJZm1qCkVhZ0psRm9Sb1c2L2RjajBiOThOMi9zaWt1blRhQldJSGpPay9ESkNiWldzU0JtOTBBY0V3dEdnN1Bhc1hOcUsKaWwydWxlMG9PYk9zTyszbDhpeU9nYktROHFDbFgwSU03UVN2Y1J4YnYwK2FCYXpxVS9BRkhMY1VmTW1VMXVKRwpLdGVuTUxzNnBLdlpyRU9EOFlacklkYmkyZDBaQWdNQkFBR2pXVEJYTUE0R0ExVWREd0VCL3dRRUF3SUNwREFQCkJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJTVm43TWdjYkhCNE9wNFc0WEhLYlNPeWdBdDREQVYKQmdOVkhSRUVEakFNZ2dwcmRXSmxjbTVsZEdWek1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRQSsxc0Rjc1RJcQp0T3V4Nk5OMkUrTFlYOFEvTk9qWlhSQVhSeDlOdXhoL0RCRmJwTjUrTzg2VWROL3BJamI0WGUyTVRGaytCTXZnCmUyWk9NNGJFQTlLR3JPc1RhK3VBL3pKZFhjUXZ0MG00Kzd5T3VqcklHOGhuOTlZSjRlTmxYYk9nV3NOTmVDMnEKT01DVFFPdGtJNVlMNFNET2ZDRUlsOEpBU0QvZTNRd0p6Mk15bnNIR2F4azZYZ3VnVkgzekVQcVNRL3FZa2pQTgpDY0ZMNXF1WWVUODUzM3g0SENKb1dmblZReHlaOVJ2V1Y0eThpT3JqbTV3Z2xvN2U3NkRmaTBwTnczRS80MysxCisrdXdWYmhZZTE0OUhyK3FzWU1YbGFiTFJmeHhXT2RxdzMxbXdJeitSSHF5V2U4V3prZnhUVGlmQjZNVVJyQXgKRWVKQWkwdWMxSkRMCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
6
+ server: https://D3CBAF956940D075AE61BB6193A93256.gr7.ap-south-1.eks.amazonaws.com
7
+ name: antiatropos
8
+ contexts:
9
+ - context:
10
+ cluster: antiatropos
11
+ user: antiatropos-hf-user
12
+ name: antiatropos
13
+ current-context: antiatropos
14
+ preferences: {}
15
+ users:
16
+ - name: antiatropos-hf-user
17
+ user:
18
+ exec:
19
+ apiVersion: client.authentication.k8s.io/v1beta1
20
+ command: aws
21
+ args:
22
+ - eks
23
+ - get-token
24
+ - --region
25
+ - ap-south-1
26
+ - --cluster-name
27
+ - antiatropos
28
+ env:
29
+ - name: AWS_STS_REGIONAL_ENDPOINTS
30
+ value: regional
31
+ - name: AWS_DEFAULT_REGION
32
+ value: ap-south-1
33
+ interactiveMode: IfAvailable
34
+
deploy/aws/prometheus-agent-values.yaml ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Helm values for Prometheus Agent that remote-writes to Amazon Managed Prometheus
2
+ #
3
+ # Usage:
4
+ # helm install prometheus-agent prometheus-community/prometheus \
5
+ # --namespace monitoring --create-namespace \
6
+ # -f prometheus-agent-values.yaml \
7
+ # --set prometheus.prometheusSpec.remoteWrite[0].url="https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/WORKSPACE_ID/api/v1/remote_write"
8
+ #
9
+ # Prerequisite: Create an IAM service account for the prometheus pod
10
+ # eksctl create iamserviceaccount \
11
+ # --cluster antiatropos \
12
+ # --namespace monitoring \
13
+ # --name prometheus-sa \
14
+ # --attach-policy-arn arn:aws:iam::aws:policy/AmazonPrometheusRemoteWriteAccess \
15
+ # --approve
16
+
17
+ prometheus:
18
+ prometheusSpec:
19
+ # Run as agent mode (remote-write only, no local query API)
20
+ agentMode: true
21
+
22
+ # Remote write — override via --set on the command line
23
+ remoteWrite:
24
+ - url: "https://aps-workspaces.ap-south-1.amazonaws.com/workspaces/REPLACE_WORKSPACE_ID/api/v1/remote_write"
25
+ sigv4:
26
+ region: ap-south-1
27
+
28
+ # Scrape the workload pods in prod-sre namespace (the microservices
29
+ # the SRE agent manages: payments, checkout, catalog, cart, auth)
30
+ additionalScrapeConfigs:
31
+ - job_name: antiatropos-workloads
32
+ metrics_path: /metrics
33
+ scrape_interval: 15s
34
+ kubernetes_sd_configs:
35
+ - role: pod
36
+ namespaces:
37
+ names:
38
+ - prod-sre
39
+ relabel_configs:
40
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
41
+ action: keep
42
+ regex: true
43
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
44
+ action: replace
45
+ target_label: __metrics_path__
46
+ regex: (.+)
47
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
48
+ action: replace
49
+ regex: ([^:]+)(?::\d+)?;(\d+)
50
+ replacement: $1:$2
51
+ target_label: __address__
52
+ - action: labelmap
53
+ regex: __meta_kubernetes_pod_label_(.+)
54
+ - source_labels: [__meta_kubernetes_namespace]
55
+ action: replace
56
+ target_label: namespace
57
+ - source_labels: [__meta_kubernetes_pod_name]
58
+ action: replace
59
+ target_label: pod
60
+
61
+ # Also scrape the Prometheus Agent's own metrics for monitoring
62
+ - job_name: prometheus-agent-self
63
+ scrape_interval: 15s
64
+ static_configs:
65
+ - targets:
66
+ - localhost:9090
67
+
68
+ resources:
69
+ requests:
70
+ cpu: 100m
71
+ memory: 256Mi
72
+ limits:
73
+ cpu: 500m
74
+ memory: 512Mi
75
+
76
+ # Short retention since we're remote-writing everything to AMP
77
+ retention: 2h
78
+
79
+ # Use the IAM service account for AMP authentication
80
+ serviceAccount:
81
+ name: prometheus-sa
82
+ create: false
83
+
84
+ # Disable alertmanager (AMP handles alerting if needed)
85
+ alertmanager:
86
+ enabled: false
87
+
88
+ # Disable pushgateway
89
+ pushgateway:
90
+ enabled: false
91
+
92
+ # Disable server (we only need the agent)
93
+ server:
94
+ enabled: false
95
+
deploy/aws/teardown-all.ps1 ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AntiAtropos - One-Run Teardown Script
2
+ # Deletes the entire AWS infrastructure: EKS cluster (plus residual eksctl CloudFormation stacks) and AMP workspace; also removes local kubeconfig entries
3
+ #
4
+ # Usage: .\deploy\aws\teardown-all.ps1
5
+
6
+ $ErrorActionPreference = "Stop"
7
+
8
+ # In PowerShell 7+, prevent native stderr output from becoming terminating errors.
9
+ if (Get-Variable -Name PSNativeCommandUseErrorActionPreference -ErrorAction SilentlyContinue) {
10
+ $PSNativeCommandUseErrorActionPreference = $false
11
+ }
12
+
13
+ $Region = "ap-south-1"
14
+ $ClusterName = "antiatropos"
15
+ $AmpAlias = "antiatropos-metrics"
16
+ $GeneratedKubeconfig = Join-Path $PSScriptRoot "kubeconfig-antiatropos.yaml"
17
+
18
+ function Invoke-CheckedCommand {
19
+ param(
20
+ [ScriptBlock]$Command,
21
+ [string]$ErrorMessage
22
+ )
23
+
24
+ $previousErrorActionPreference = $ErrorActionPreference
25
+ $ErrorActionPreference = "Continue"
26
+ try {
27
+ & $Command
28
+ } finally {
29
+ $ErrorActionPreference = $previousErrorActionPreference
30
+ }
31
+
32
+ if ($LASTEXITCODE -ne 0) {
33
+ throw $ErrorMessage
34
+ }
35
+ }
36
+
37
+ function Get-EksClusterStatus {
38
+ param(
39
+ [string]$Name,
40
+ [string]$AwsRegion
41
+ )
42
+
43
+ try {
44
+ $status = aws eks describe-cluster --name $Name --region $AwsRegion --query 'cluster.status' --output text 2>$null
45
+ } catch {
46
+ return $null
47
+ }
48
+
49
+ if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($status) -or $status -eq "None") {
50
+ return $null
51
+ }
52
+
53
+ return $status.Trim()
54
+ }
55
+
56
+ function Get-EksNodegroups {
57
+ param(
58
+ [string]$Name,
59
+ [string]$AwsRegion
60
+ )
61
+
62
+ try {
63
+ $raw = aws eks list-nodegroups --cluster-name $Name --region $AwsRegion --query 'nodegroups' --output text 2>$null
64
+ } catch {
65
+ return @()
66
+ }
67
+
68
+ if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($raw) -or $raw -eq "None") {
69
+ return @()
70
+ }
71
+
72
+ return @($raw -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
73
+ }
74
+
75
+ function Remove-ResidualEksStacks {
76
+ param(
77
+ [string]$Cluster,
78
+ [string]$AwsRegion
79
+ )
80
+
81
+ $stackPrefix = "eksctl-$Cluster"
82
+ $stackQuery = "StackSummaries[?starts_with(StackName, '$stackPrefix') && (StackStatus!='DELETE_COMPLETE' && StackStatus!='DELETE_IN_PROGRESS')].StackName"
83
+
84
+ $stacksText = aws cloudformation list-stacks --region $AwsRegion --query $stackQuery --output text 2>$null
85
+ if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($stacksText) -or $stacksText -eq "None") {
86
+ return
87
+ }
88
+
89
+ $stacks = @($stacksText -split '\s+' | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
90
+ foreach ($stack in $stacks) {
91
+ Write-Host "Deleting residual stack: $stack" -ForegroundColor Yellow
92
+ Invoke-CheckedCommand -Command { aws cloudformation delete-stack --stack-name $stack --region $AwsRegion 2>$null | Out-Null } -ErrorMessage "Failed to delete stack '$stack'"
93
+ Invoke-CheckedCommand -Command { aws cloudformation wait stack-delete-complete --stack-name $stack --region $AwsRegion } -ErrorMessage "Timed out deleting stack '$stack'"
94
+ }
95
+ }
96
+
97
+ function Get-AmpWorkspaceIdByAlias {
98
+ param(
99
+ [string]$Alias,
100
+ [string]$AwsRegion
101
+ )
102
+
103
+ try {
104
+ $id = aws amp list-workspaces --alias $Alias --region $AwsRegion --query 'workspaces[0].workspaceId' --output text 2>$null
105
+ } catch {
106
+ return $null
107
+ }
108
+
109
+ if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($id) -or $id -eq "None") {
110
+ return $null
111
+ }
112
+
113
+ return $id.Trim()
114
+ }
115
+
116
+ function Wait-AmpWorkspaceDeleted {
117
+ param(
118
+ [string]$WorkspaceId,
119
+ [string]$AwsRegion
120
+ )
121
+
122
+ for ($i = 0; $i -lt 30; $i++) {
123
+ try {
124
+ $status = aws amp describe-workspace --workspace-id $WorkspaceId --region $AwsRegion --query 'workspace.status.statusCode' --output text 2>$null
125
+ } catch {
126
+ return
127
+ }
128
+
129
+ if ($LASTEXITCODE -ne 0 -or [string]::IsNullOrWhiteSpace($status) -or $status -eq "None") {
130
+ return
131
+ }
132
+ Start-Sleep -Seconds 10
133
+ }
134
+
135
+ throw "AMP workspace '$WorkspaceId' deletion timed out"
136
+ }
137
+
138
+ Write-Host ""
139
+ Write-Host "==========================================" -ForegroundColor Red
140
+ Write-Host " AntiAtropos AWS Infrastructure Teardown" -ForegroundColor Red
141
+ Write-Host "==========================================" -ForegroundColor Red
142
+ Write-Host "Region: $Region"
143
+ Write-Host "Cluster: $ClusterName"
144
+ Write-Host ""
145
+
146
+ # --- Step 1: Delete EKS Cluster ---
147
+ Write-Host ">>> Step 1: Deleting EKS cluster..." -ForegroundColor Yellow
148
+
149
+ $clusterStatus = Get-EksClusterStatus -Name $ClusterName -AwsRegion $Region
150
+ if ($clusterStatus) {
151
+ Write-Host "Cluster status: $clusterStatus" -ForegroundColor Yellow
152
+
153
+ if ($clusterStatus -ne "DELETING") {
154
+ $nodegroups = Get-EksNodegroups -Name $ClusterName -AwsRegion $Region
155
+ foreach ($ng in $nodegroups) {
156
+ Write-Host "Deleting nodegroup: $ng" -ForegroundColor Yellow
157
+ $ngStatus = aws eks describe-nodegroup --cluster-name $ClusterName --nodegroup-name $ng --region $Region --query 'nodegroup.status' --output text 2>$null
158
+ if ($LASTEXITCODE -eq 0 -and $ngStatus -ne "DELETING") {
159
+ Invoke-CheckedCommand -Command { aws eks delete-nodegroup --cluster-name $ClusterName --nodegroup-name $ng --region $Region --output text 2>$null | Out-Null } -ErrorMessage "Failed to start deletion for nodegroup '$ng'"
160
+ } else {
161
+ Write-Host "Nodegroup '$ng' already deleting" -ForegroundColor Yellow
162
+ }
163
+
164
+ Write-Host "Waiting for nodegroup deletion: $ng" -ForegroundColor Yellow
165
+ Invoke-CheckedCommand -Command { aws eks wait nodegroup-deleted --cluster-name $ClusterName --nodegroup-name $ng --region $Region } -ErrorMessage "Timed out waiting for nodegroup '$ng' deletion"
166
+ Write-Host "OK: Nodegroup deleted: $ng" -ForegroundColor Green
167
+ }
168
+
169
+ Write-Host "Deleting cluster control plane..." -ForegroundColor Yellow
170
+ Invoke-CheckedCommand -Command { eksctl delete cluster --name $ClusterName --region $Region 2>$null | Out-Null } -ErrorMessage "Failed to delete EKS cluster"
171
+ } else {
172
+ Write-Host "Cluster is already deleting" -ForegroundColor Yellow
173
+ }
174
+
175
+ Write-Host "Waiting for cluster deletion..." -ForegroundColor Yellow
176
+ Invoke-CheckedCommand -Command { aws eks wait cluster-deleted --name $ClusterName --region $Region } -ErrorMessage "Timed out waiting for EKS cluster deletion"
177
+ Write-Host "OK: Cluster deleted" -ForegroundColor Green
178
+ } else {
179
+ Write-Host "OK: Cluster not found, skipping" -ForegroundColor Green
180
+ }
181
+
182
+ Write-Host "Checking for residual eksctl stacks..." -ForegroundColor Yellow
183
+ Remove-ResidualEksStacks -Cluster $ClusterName -AwsRegion $Region
184
+ Write-Host "OK: Residual EKS stacks cleaned" -ForegroundColor Green
185
+
186
+ # --- Step 2: Delete AMP Workspace ---
187
+ Write-Host ""
188
+ Write-Host ">>> Step 2: Deleting AMP workspace..." -ForegroundColor Yellow
189
+
190
+ $AmpWsId = Get-AmpWorkspaceIdByAlias -Alias $AmpAlias -AwsRegion $Region
191
+
192
+ if (-not [string]::IsNullOrWhiteSpace($AmpWsId)) {
193
+ Invoke-CheckedCommand -Command { aws amp delete-workspace --workspace-id $AmpWsId --region $Region | Out-Null } -ErrorMessage "Failed to delete AMP workspace '$AmpWsId'"
194
+ Wait-AmpWorkspaceDeleted -WorkspaceId $AmpWsId -AwsRegion $Region
195
+ Write-Host "OK: AMP workspace deleted: $AmpWsId" -ForegroundColor Green
196
+ } else {
197
+ Write-Host "OK: AMP workspace not found, skipping" -ForegroundColor Green
198
+ }
199
+
200
+ # --- Step 3: Local kubeconfig cleanup ---
201
+ Write-Host ""
202
+ Write-Host ">>> Step 3: Cleaning local kubeconfig entries..." -ForegroundColor Yellow
203
+
204
+ try { kubectl config delete-context $ClusterName 2>$null | Out-Null } catch {}
205
+ try { kubectl config delete-cluster $ClusterName 2>$null | Out-Null } catch {}
206
+ try { kubectl config delete-user antiatropos-hf-user 2>$null | Out-Null } catch {}
207
+
208
+ if (Test-Path $GeneratedKubeconfig) {
209
+ Remove-Item $GeneratedKubeconfig -Force
210
+ Write-Host "OK: Removed generated kubeconfig file" -ForegroundColor Green
211
+ } else {
212
+ Write-Host "OK: Generated kubeconfig file not found, skipping" -ForegroundColor Green
213
+ }
214
+
215
+ # --- Step 4: Verify Cleanup ---
216
+ Write-Host ""
217
+ Write-Host ">>> Step 4: Verifying cleanup..." -ForegroundColor Yellow
218
+
219
+ $clusterStillExists = [bool](Get-EksClusterStatus -Name $ClusterName -AwsRegion $Region)
220
+
221
+ if ($clusterStillExists) {
222
+ Write-Host "WARN: Cluster still exists (deletion in progress)" -ForegroundColor Yellow
223
+ } else {
224
+ Write-Host "OK: Cluster deleted" -ForegroundColor Green
225
+ }
226
+
227
+ $ampStillExists = -not [string]::IsNullOrWhiteSpace((Get-AmpWorkspaceIdByAlias -Alias $AmpAlias -AwsRegion $Region))
228
+
229
+ if ($ampStillExists) {
230
+ Write-Host "WARN: AMP workspace alias '$AmpAlias' still exists" -ForegroundColor Yellow
231
+ } else {
232
+ Write-Host "OK: AMP workspace deleted" -ForegroundColor Green
233
+ }
234
+
235
+ # --- Done ---
236
+ Write-Host ""
237
+ Write-Host "==========================================" -ForegroundColor Green
238
+ Write-Host " Teardown Complete!" -ForegroundColor Green
239
+ Write-Host "==========================================" -ForegroundColor Green
240
+ Write-Host ""
241
+ Write-Host "Teardown finished. Review any WARN lines above to confirm all AWS infrastructure has been removed." -ForegroundColor Yellow  # Step 4 may have reported leftovers, so do not claim unconditional success
242
+ Write-Host ""
deploy/do/README.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DigitalOcean Droplet one-shot deploy
2
+
3
+ This deploy flow is for a single Ubuntu Droplet running:
4
+ - k3s (single-node Kubernetes)
5
+ - AntiAtropos sample workloads (`prod-sre`)
6
+ - Prometheus + Grafana (`monitoring`)
7
+ - lightweight control-plane API (`antiatropos-control` on port `8010`)
8
+
9
+ The OpenEnv runtime (`server.app`) is intentionally **not** run on the droplet.
10
+ The only supported split is:
11
+ - local machine: OpenEnv server + inference loop
12
+ - droplet: Kubernetes executor API + observability stack
13
+
14
+ ## Run
15
+
16
+ From repository root on the Droplet:
17
+
18
+ ```bash
19
+ sudo bash deploy/do/deploy-droplet-one-shot.sh
20
+ ```
21
+
22
+ Optional overrides:
23
+
24
+ ```bash
25
+ sudo REPO_DIR=/opt/AntiAtropos CONTROL_PORT=8010 MAX_REPLICAS=200 bash deploy/do/deploy-droplet-one-shot.sh
26
+ ```
27
+
28
+ ## What the script configures
29
+
30
+ - k3s kubelet with `max-pods=250`
31
+ - Prometheus service exposed on NodePort `30090`
32
+ - Prometheus scrape job for annotated pods in namespace `prod-sre`
33
+ - Env file at `.env.droplet` with:
34
+ - `KUBECONFIG=/etc/rancher/k3s/k3s.yaml`
35
+ - `ANTIATROPOS_WORKLOAD_MAP` for `node-0`..`node-4`
36
+ - Systemd service:
37
+ - Name: `antiatropos-control`
38
+ - Exec: `uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010`
39
+ - Legacy cleanup:
40
+ - `antiatropos-fastapi` (the legacy VM OpenEnv service) is disabled and removed by the default deploy path
41
+
42
+ ## Verify
43
+
44
+ ```bash
45
+ systemctl status antiatropos-control --no-pager
46
+ curl http://127.0.0.1:8010/health
47
+ kubectl get deploy -n prod-sre
48
+ kubectl get pods -n monitoring
49
+ curl http://127.0.0.1:30090/api/v1/targets
50
+ kubectl -n monitoring port-forward svc/grafana 3000:80
51
+ ```
52
+
53
+ Set local `.env` to use this consolidated path:
54
+
55
+ ```env
56
+ ENV_URL=http://localhost:8000
57
+ ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010
58
+ PROMETHEUS_URL=http://<droplet-ip>:30090
59
+ ```
60
+
61
+ ## Deterministic remote-scaling proof
62
+
63
+ On the droplet, watch the desired replica counts:
64
+
65
+ ```bash
66
+ watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas,AVAILABLE:.status.availableReplicas'
67
+ ```
68
+
69
+ From local machine, send one control action:
70
+
71
+ ```bash
72
+ curl -X POST http://<droplet-ip>:8010/step \
73
+ -H "Content-Type: application/json" \
74
+ -d '{"action_type":"SCALE_UP","target_node_id":"node-0","parameter":1.0}'
75
+ ```
76
+
77
+ If the desired replica count for `payments` increases, the scaling action is being executed on the droplet.
78
+
79
+ ## Troubleshooting
80
+
81
+ - **Pods do not move during inference**
82
+ - Verify local env points to droplet control API:
83
+ - `ANTIATROPOS_CONTROL_PLANE_URL=http://<droplet-ip>:8010`
84
+ - Check droplet control health:
85
+ - `curl http://127.0.0.1:8010/health`
86
+ - Check service status:
87
+ - `systemctl status antiatropos-control --no-pager`
88
+ - **Connection refused from local to droplet:8010**
89
+ - Service not running or firewall closed.
90
+ - Start service and open firewall if needed.
91
+ - **Need to remove legacy VM OpenEnv service**
92
+ - `sudo bash deploy/do/uninstall-legacy-openenv.sh`
deploy/do/antiatropos-control.service ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Unit]
2
+ Description=AntiAtropos Droplet Control API
3
+ After=network-online.target k3s.service
4
+ Wants=network-online.target
5
+
6
+ [Service]
7
+ Type=simple
8
+ User=root
9
+ WorkingDirectory=/root/Anti-Atropos
10
+ EnvironmentFile=/root/Anti-Atropos/.env.droplet
11
+ ExecStart=/root/Anti-Atropos/.venv-droplet/bin/uvicorn server.local_laptop_control:app --host 0.0.0.0 --port 8010
12
+ Restart=always
13
+ RestartSec=3
14
+
15
+ [Install]
16
+ WantedBy=multi-user.target
deploy/do/deploy-droplet-one-shot.sh ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # One-shot deploy for a single DigitalOcean Droplet:
5
+ # - Installs k3s with kubelet max-pods=250
6
+ # - Deploys workloads + Prometheus + Grafana
7
+ # - Creates env file for live Kubernetes scaling
8
+ # - Starts lightweight control-plane API via systemd (antiatropos-control)
9
+
10
+ if [[ "${EUID}" -ne 0 ]]; then
11
+ echo "Run as root: sudo bash deploy/do/deploy-droplet-one-shot.sh"
12
+ exit 1
13
+ fi
14
+
15
+ REPO_DIR="${REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
16
+ KUBECONFIG_PATH="${KUBECONFIG_PATH:-/etc/rancher/k3s/k3s.yaml}"
17
+ CONTROL_PORT="${CONTROL_PORT:-8010}"
18
+ CONTROL_HOST="${CONTROL_HOST:-0.0.0.0}"
19
+ K8S_NAMESPACE="${K8S_NAMESPACE:-prod-sre}"
20
+ MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}"
21
+ PY_VENV_DIR="${PY_VENV_DIR:-${REPO_DIR}/.venv-droplet}"
22
+ ENV_FILE="${ENV_FILE:-${REPO_DIR}/.env.droplet}"
23
+ MIN_REPLICAS="${MIN_REPLICAS:-1}"
24
+ MAX_REPLICAS="${MAX_REPLICAS:-250}"
25
+ SCALE_STEP="${SCALE_STEP:-3}"
26
+ WORKLOAD_MAP="${WORKLOAD_MAP:-{\"node-0\":{\"deployment\":\"payments\",\"namespace\":\"prod-sre\"},\"node-1\":{\"deployment\":\"checkout\",\"namespace\":\"prod-sre\"},\"node-2\":{\"deployment\":\"catalog\",\"namespace\":\"prod-sre\"},\"node-3\":{\"deployment\":\"cart\",\"namespace\":\"prod-sre\"},\"node-4\":{\"deployment\":\"auth\",\"namespace\":\"prod-sre\"}}}"
27
+
28
+ echo "=== AntiAtropos Droplet One-Shot Deploy ==="
29
+ echo "Repo: ${REPO_DIR}"
30
+ echo "Kubeconfig: ${KUBECONFIG_PATH}"
31
+ echo "Control API: ${CONTROL_HOST}:${CONTROL_PORT}"
32
+ echo ""
33
+
34
+ if [[ ! -f "${REPO_DIR}/deploy/local-laptop.yaml" ]]; then
35
+ echo "ERROR: deploy/local-laptop.yaml not found. Run from AntiAtropos checkout."
36
+ exit 1
37
+ fi
38
+
39
+ export DEBIAN_FRONTEND=noninteractive
40
+ apt-get update
41
+ apt-get install -y curl ca-certificates gnupg lsb-release python3 python3-venv python3-pip
42
+
43
+ if ! command -v kubectl >/dev/null 2>&1; then
44
+ echo "Installing k3s (includes kubectl)..."
45
+ curl -sfL https://get.k3s.io | sh -s - --write-kubeconfig-mode 644 --kubelet-arg=max-pods=250
46
+ else
47
+ echo "k3s/kubectl already present; skipping k3s install."
48
+ fi
49
+
50
+ if ! command -v helm >/dev/null 2>&1; then
51
+ echo "Installing Helm..."
52
+ curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
53
+ fi
54
+
55
+ export KUBECONFIG="${KUBECONFIG_PATH}"
56
+
57
+ echo "Waiting for Kubernetes node to be Ready..."
58
+ kubectl wait --for=condition=Ready node --all --timeout=180s
59
+
60
+ kubectl create ns "${K8S_NAMESPACE}" >/dev/null 2>&1 || true
61
+ kubectl create ns "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
62
+
63
+ echo "Deploying AntiAtropos workloads..."
64
+ kubectl apply -f "${REPO_DIR}/deploy/local-laptop.yaml"
65
+
66
+ echo "Installing/upgrading Prometheus + Grafana..."
67
+ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
68
+ helm repo add grafana https://grafana.github.io/helm-charts >/dev/null 2>&1 || true
69
+ helm repo update
70
+
71
+ helm upgrade --install prometheus prometheus-community/prometheus \
72
+ -n "${MONITORING_NAMESPACE}" \
73
+ -f "${REPO_DIR}/deploy/prometheus-helm-values.yaml"
74
+
75
+ if [[ -d "${REPO_DIR}/deploy/grafana/provisioning/dashboards/json" ]]; then
76
+ kubectl delete configmap grafana-dashboards -n "${MONITORING_NAMESPACE}" >/dev/null 2>&1 || true
77
+ kubectl create configmap grafana-dashboards \
78
+ -n "${MONITORING_NAMESPACE}" \
79
+ --from-file="${REPO_DIR}/deploy/grafana/provisioning/dashboards/json/"
80
+ fi
81
+
82
+ helm upgrade --install grafana grafana/grafana \
83
+ -n "${MONITORING_NAMESPACE}" \
84
+ -f "${REPO_DIR}/deploy/grafana-helm-values.yaml"
85
+
86
+ echo "Exposing Grafana on NodePort 30000..."
87
+ kubectl patch svc grafana -n "${MONITORING_NAMESPACE}" --type='merge' -p '{
88
+ "spec": {
89
+ "type": "NodePort",
90
+ "ports": [
91
+ {"port": 80, "nodePort": 30000, "targetPort": 3000, "name": "service"}
92
+ ]
93
+ }
94
+ }' || true
95
+
96
+ echo "Waiting for Grafana pods to be ready..."
97
+ kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=grafana -n "${MONITORING_NAMESPACE}" --timeout=180s || true
98
+
99
+ if [[ ! -f "${ENV_FILE}" ]]; then
100
+ cat > "${ENV_FILE}" <<EOF
101
+ KUBECONFIG=/etc/rancher/k3s/k3s.yaml
102
+ ANTIATROPOS_K8S_NAMESPACE=prod-sre
103
+ ANTIATROPOS_MIN_REPLICAS=${MIN_REPLICAS}
104
+ ANTIATROPOS_MAX_REPLICAS=${MAX_REPLICAS}
105
+ ANTIATROPOS_SCALE_STEP=${SCALE_STEP}
106
+ ANTIATROPOS_WORKLOAD_MAP=${WORKLOAD_MAP}
107
+ EOF
108
+ echo "Created ${ENV_FILE}"
109
+ else
110
+ echo "Using existing ${ENV_FILE}"
111
+ fi
112
+
113
+ echo "Preparing Python environment..."
114
+ python3 -m venv "${PY_VENV_DIR}"
115
+ "${PY_VENV_DIR}/bin/python" -m pip install --upgrade pip
116
+ if [[ -f "${REPO_DIR}/pyproject.toml" ]]; then
117
+ # Prefer project metadata (uses openenv-core, not legacy openenv package name).
118
+ "${PY_VENV_DIR}/bin/pip" install -e "${REPO_DIR}"
119
+ else
120
+ "${PY_VENV_DIR}/bin/pip" install -r "${REPO_DIR}/server/requirements.txt"
121
+ fi
122
+
123
+ # Hard cleanup: remove legacy VM OpenEnv service if it exists.
124
+ if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
125
+ echo "Disabling legacy service antiatropos-fastapi..."
126
+ systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
127
+ rm -f /etc/systemd/system/antiatropos-fastapi.service
128
+ fi
129
+
130
+ cat > /etc/systemd/system/antiatropos-control.service <<EOF
131
+ [Unit]
132
+ Description=AntiAtropos Droplet Control API
133
+ After=network-online.target k3s.service
134
+ Wants=network-online.target
135
+
136
+ [Service]
137
+ Type=simple
138
+ User=root
139
+ WorkingDirectory=${REPO_DIR}
140
+ EnvironmentFile=${ENV_FILE}
141
+ ExecStart=${PY_VENV_DIR}/bin/uvicorn server.local_laptop_control:app --host ${CONTROL_HOST} --port ${CONTROL_PORT}
142
+ Restart=always
143
+ RestartSec=3
144
+
145
+ [Install]
146
+ WantedBy=multi-user.target
147
+ EOF
148
+
149
+ systemctl daemon-reload
150
+ systemctl enable --now antiatropos-control
151
+
152
+ echo ""
153
+ echo "Waiting for control API readiness..."
154
+ for _ in {1..30}; do
155
+ if curl -fsS "http://127.0.0.1:${CONTROL_PORT}/health" >/dev/null 2>&1; then
156
+ break
157
+ fi
158
+ sleep 2
159
+ done
160
+
161
+ PUBLIC_IP="$(curl -fsS https://api.ipify.org 2>/dev/null || true)"
162
+ if [[ -z "${PUBLIC_IP}" ]]; then
163
+ PUBLIC_IP="$(hostname -I 2>/dev/null | awk '{print $1}')"
164
+ fi
165
+ PROM_URL_DISPLAY="http://${PUBLIC_IP:-<droplet-ip>}:30090"
166
+
167
+ echo ""
168
+ echo "=== Deploy Complete ==="
169
+ echo "Control health: http://127.0.0.1:${CONTROL_PORT}/health"
170
+ echo "Control step: http://127.0.0.1:${CONTROL_PORT}/step"
171
+ echo "Prometheus svc: kubectl -n ${MONITORING_NAMESPACE} get svc prometheus-server"
172
+ echo "Prometheus URL: ${PROM_URL_DISPLAY}"
173
+ echo "Grafana URL: http://${PUBLIC_IP:-<droplet-ip>}:30000 (admin / antiatropos)"
174
+ echo ""
175
+ echo "Service status command:"
176
+ echo " systemctl status antiatropos-control --no-pager"
177
+ echo ""
178
+ echo "If needed, edit env and restart control service:"
179
+ echo " ${ENV_FILE}"
180
+ echo " systemctl restart antiatropos-control"
181
+ echo ""
182
+ echo "Verify remote scaling path:"
183
+ echo " watch -n 1 'kubectl -n prod-sre get deploy -o custom-columns=NAME:.metadata.name,DESIRED:.spec.replicas,READY:.status.readyReplicas'"
deploy/do/uninstall-legacy-openenv.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # Removes legacy VM OpenEnv service path.
5
+ # This keeps droplet runtime focused on control API + observability only.
6
+
7
+ if [[ "${EUID}" -ne 0 ]]; then
8
+ echo "Run as root: sudo bash deploy/do/uninstall-legacy-openenv.sh"
9
+ exit 1
10
+ fi
11
+
12
+ if systemctl list-unit-files | grep -q '^antiatropos-fastapi\.service'; then
13
+ echo "Stopping and disabling antiatropos-fastapi..."
14
+ systemctl disable --now antiatropos-fastapi >/dev/null 2>&1 || true
15
+ else
16
+ echo "antiatropos-fastapi service not registered."
17
+ fi
18
+
19
+ if [[ -f /etc/systemd/system/antiatropos-fastapi.service ]]; then
20
+ rm -f /etc/systemd/system/antiatropos-fastapi.service
21
+ echo "Removed /etc/systemd/system/antiatropos-fastapi.service"
22
+ fi
23
+
24
+ systemctl daemon-reload
25
+ echo "Legacy VM OpenEnv service cleanup complete."
deploy/entrypoint.sh CHANGED
@@ -1,62 +1,71 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- FASTAPI_PID=""
5
- PROMETHEUS_PID=""
6
- GRAFANA_PID=""
7
- NGINX_PID=""
8
- MONITOR_PID=""
9
-
10
- cleanup() {
11
- for pid in "${MONITOR_PID}" "${NGINX_PID}" "${GRAFANA_PID}" "${PROMETHEUS_PID}" "${FASTAPI_PID}"; do
12
- if [[ -n "${pid}" ]]; then
13
- kill "${pid}" 2>/dev/null || true
14
- fi
15
- done
16
- }
17
-
18
- trap cleanup INT TERM EXIT
19
-
20
- cd /app
21
-
22
- uvicorn server.app:app --host 127.0.0.1 --port 8000 &
23
- FASTAPI_PID=$!
24
-
25
- /opt/prometheus/prometheus \
26
- --config.file=/etc/prometheus/prometheus.yml \
27
- --storage.tsdb.path=/tmp/prometheus-data \
28
- --web.listen-address=127.0.0.1:9090 \
29
- --web.route-prefix=/prometheus \
30
- &
31
- PROMETHEUS_PID=$!
32
-
33
- /opt/grafana/bin/grafana-server \
34
- --homepath /opt/grafana \
35
- --config /etc/grafana/grafana.ini \
36
- cfg:default.paths.data=/var/lib/grafana \
37
- cfg:default.paths.logs=/var/log/grafana \
38
- cfg:default.paths.plugins=/var/lib/grafana/plugins \
39
- cfg:default.paths.provisioning=/etc/grafana/provisioning \
40
- &
41
- GRAFANA_PID=$!
42
-
43
- nginx -g "daemon off;" &
44
- NGINX_PID=$!
45
-
46
- monitor_children() {
47
- while true; do
48
- for pid in "${FASTAPI_PID}" "${PROMETHEUS_PID}" "${GRAFANA_PID}"; do
49
- if ! kill -0 "${pid}" 2>/dev/null; then
50
- echo "A backend service exited unexpectedly." >&2
51
- kill "${NGINX_PID}" 2>/dev/null || true
52
- exit 1
53
- fi
54
- done
55
- sleep 2
56
- done
57
- }
58
-
59
- monitor_children &
60
- MONITOR_PID=$!
61
-
62
- wait "${NGINX_PID}"
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ FASTAPI_PID=""
5
+ PROMETHEUS_PID=""
6
+ GRAFANA_PID=""
7
+ NGINX_PID=""
8
+ MONITOR_PID=""
9
+
10
+ cleanup() {
11
+ for pid in "${MONITOR_PID}" "${NGINX_PID}" "${GRAFANA_PID}" "${PROMETHEUS_PID}" "${FASTAPI_PID}"; do
12
+ if [[ -n "${pid}" ]]; then
13
+ kill "${pid}" 2>/dev/null || true
14
+ fi
15
+ done
16
+ }
17
+
18
+ trap cleanup INT TERM EXIT
19
+
20
+ cd /app
21
+
22
+ # Source HF Spaces live-mode config if present (overrides Dockerfile defaults)
23
+ if [[ -f /app/.env.hf ]]; then
24
+ echo "Loading .env.hf..."
25
+ set -a
26
+ # shellcheck source=/dev/null
27
+ source /app/.env.hf
28
+ set +a
29
+ fi
30
+
31
+ uvicorn server.app:app --host 127.0.0.1 --port 8000 &
32
+ FASTAPI_PID=$!
33
+
34
+ /opt/prometheus/prometheus \
35
+ --config.file=/etc/prometheus/prometheus.yml \
36
+ --storage.tsdb.path=/tmp/prometheus-data \
37
+ --web.listen-address=127.0.0.1:9090 \
38
+ --web.route-prefix=/prometheus \
39
+ &
40
+ PROMETHEUS_PID=$!
41
+
42
+ /opt/grafana/bin/grafana-server \
43
+ --homepath /opt/grafana \
44
+ --config /etc/grafana/grafana.ini \
45
+ cfg:default.paths.data=/var/lib/grafana \
46
+ cfg:default.paths.logs=/var/log/grafana \
47
+ cfg:default.paths.plugins=/var/lib/grafana/plugins \
48
+ cfg:default.paths.provisioning=/etc/grafana/provisioning \
49
+ &
50
+ GRAFANA_PID=$!
51
+
52
+ nginx -g "daemon off;" &
53
+ NGINX_PID=$!
54
+
55
+ monitor_children() {
56
+ while true; do
57
+ for pid in "${FASTAPI_PID}" "${PROMETHEUS_PID}" "${GRAFANA_PID}"; do
58
+ if ! kill -0 "${pid}" 2>/dev/null; then
59
+ echo "A backend service exited unexpectedly." >&2
60
+ kill "${NGINX_PID}" 2>/dev/null || true
61
+ exit 1
62
+ fi
63
+ done
64
+ sleep 2
65
+ done
66
+ }
67
+
68
+ monitor_children &
69
+ MONITOR_PID=$!
70
+
71
+ wait "${NGINX_PID}"
deploy/grafana-datasource-local.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grafana datasource provisioning - points to in-cluster Prometheus
2
+ apiVersion: 1
3
+
4
+ datasources:
5
+ - name: Prometheus
6
+ uid: PBFA97CFB590B2093
7
+ type: prometheus
8
+ access: proxy
9
+ url: http://prometheus-server.monitoring.svc.cluster.local
10
+ isDefault: true
11
+ editable: true
deploy/grafana-helm-values.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Grafana self-hosted in-cluster (Kind locally, k3s on the droplet) - Simplified dashboard + datasource setup
2
+
3
+ adminUser: admin
4
+ adminPassword: antiatropos
5
+
6
+ service:
7
+ type: ClusterIP
8
+ port: 80
9
+
10
+ persistence:
11
+ enabled: false
12
+
13
+ # Datasource provisioning - mount as separate file
14
+ datasources:
15
+ datasources.yaml:
16
+ apiVersion: 1
17
+ datasources:
18
+ - name: Prometheus
19
+ uid: PBFA97CFB590B2093
20
+ type: prometheus
21
+ access: proxy
22
+ url: http://prometheus-server.monitoring.svc.cluster.local
23
+ isDefault: true
24
+ editable: true
25
+
26
+ # Dashboard provider config
27
+ dashboardProviders:
28
+ dashboardproviders.yaml:
29
+ apiVersion: 1
30
+ providers:
31
+ - name: AntiAtropos
32
+ orgId: 1
33
+ folder: AntiAtropos
34
+ type: file
35
+ disableDeletion: false
36
+ editable: true
37
+ updateIntervalSeconds: 30
38
+ options:
39
+ path: /var/lib/grafana/dashboards/antiatropos
40
+
41
+ # Mount dashboard JSONs from ConfigMap
42
+ extraConfigmapMounts:
43
+ - name: grafana-dashboards
44
+ configMap: grafana-dashboards
45
+ mountPath: /var/lib/grafana/dashboards/antiatropos
46
+ readOnly: true
deploy/grafana/grafana.ini CHANGED
@@ -1,21 +1,21 @@
1
- [server]
2
- http_addr = 127.0.0.1
3
- http_port = 3000
4
- domain = localhost
5
- root_url = /grafana/
6
- serve_from_sub_path = true
7
- router_logging = false
8
- enable_gzip = true
9
-
10
- [auth]
11
- disable_login_form = false
12
-
13
- [auth.anonymous]
14
- enabled = true
15
- org_role = Viewer
16
-
17
- [dashboards]
18
- default_home_dashboard_path = /etc/grafana/provisioning/dashboards/json/antiatropos-overview.json
19
-
20
- [security]
21
- allow_embedding = true
 
1
+ [server]
2
+ http_addr = 127.0.0.1
3
+ http_port = 3000
4
+ domain = localhost
5
+ root_url = /grafana/
6
+ serve_from_sub_path = true
7
+ router_logging = false
8
+ enable_gzip = true
9
+
10
+ [auth]
11
+ disable_login_form = false
12
+
13
+ [auth.anonymous]
14
+ enabled = true
15
+ org_role = Viewer
16
+
17
+ [dashboards]
18
+ default_home_dashboard_path = /etc/grafana/provisioning/dashboards/json/antiatropos-overview.json
19
+
20
+ [security]
21
+ allow_embedding = true
deploy/grafana/provisioning/dashboards/dashboard.yaml CHANGED
@@ -1,12 +1,12 @@
1
- apiVersion: 1
2
-
3
- providers:
4
- - name: AntiAtropos Dashboards
5
- orgId: 1
6
- folder: AntiAtropos
7
- type: file
8
- disableDeletion: false
9
- editable: true
10
- updateIntervalSeconds: 30
11
- options:
12
- path: /etc/grafana/provisioning/dashboards/json
 
1
+ apiVersion: 1
2
+
3
+ providers:
4
+ - name: AntiAtropos Dashboards
5
+ orgId: 1
6
+ folder: AntiAtropos
7
+ type: file
8
+ disableDeletion: false
9
+ editable: true
10
+ updateIntervalSeconds: 30
11
+ options:
12
+ path: /etc/grafana/provisioning/dashboards/json
deploy/grafana/provisioning/dashboards/json/antiatropos-live.json CHANGED
@@ -1,334 +1,334 @@
1
- {
2
- "annotations": {
3
- "list": [
4
- {
5
- "builtIn": 1,
6
- "datasource": {
7
- "type": "grafana",
8
- "uid": "-- Grafana --"
9
- },
10
- "enable": true,
11
- "hide": true,
12
- "iconColor": "rgba(0, 211, 255, 1)",
13
- "name": "Annotations & Alerts",
14
- "type": "dashboard"
15
- }
16
- ]
17
- },
18
- "editable": true,
19
- "fiscalYearStartMonth": 0,
20
- "graphTooltip": 0,
21
- "id": null,
22
- "links": [],
23
- "liveNow": false,
24
- "panels": [
25
- {
26
- "datasource": {
27
- "type": "prometheus",
28
- "uid": "PBFA97CFB590B2093"
29
- },
30
- "fieldConfig": {
31
- "defaults": {
32
- "color": {
33
- "mode": "palette-classic"
34
- }
35
- },
36
- "overrides": []
37
- },
38
- "gridPos": {
39
- "h": 7,
40
- "w": 12,
41
- "x": 0,
42
- "y": 0
43
- },
44
- "id": 1,
45
- "options": {
46
- "legend": {
47
- "calcs": [],
48
- "displayMode": "list",
49
- "placement": "bottom"
50
- },
51
- "tooltip": {
52
- "mode": "single"
53
- }
54
- },
55
- "targets": [
56
- {
57
- "expr": "sum by (action_type, ack_class) (rate(antiatropos_actions_total{task_id=~\"$task\",mode=~\"$mode\"}[1m]))",
58
- "legendFormat": "{{action_type}} {{ack_class}}",
59
- "refId": "A"
60
- }
61
- ],
62
- "title": "Actions Per Second",
63
- "type": "timeseries"
64
- },
65
- {
66
- "datasource": {
67
- "type": "prometheus",
68
- "uid": "PBFA97CFB590B2093"
69
- },
70
- "fieldConfig": {
71
- "defaults": {
72
- "color": {
73
- "mode": "palette-classic"
74
- },
75
- "min": 0,
76
- "max": 1
77
- },
78
- "overrides": []
79
- },
80
- "gridPos": {
81
- "h": 7,
82
- "w": 12,
83
- "x": 12,
84
- "y": 0
85
- },
86
- "id": 2,
87
- "options": {
88
- "legend": {
89
- "calcs": [],
90
- "displayMode": "table",
91
- "placement": "bottom"
92
- },
93
- "tooltip": {
94
- "mode": "single"
95
- }
96
- },
97
- "targets": [
98
- {
99
- "expr": "antiatropos_reward_normalized{task_id=~\"$task\",mode=~\"$mode\"}",
100
- "legendFormat": "{{task_id}}/{{mode}} normalized",
101
- "refId": "A"
102
- }
103
- ],
104
- "title": "Normalized Reward [0,1]",
105
- "type": "timeseries"
106
- },
107
- {
108
- "datasource": {
109
- "type": "prometheus",
110
- "uid": "PBFA97CFB590B2093"
111
- },
112
- "fieldConfig": {
113
- "defaults": {
114
- "color": {
115
- "mode": "palette-classic"
116
- }
117
- },
118
- "overrides": []
119
- },
120
- "gridPos": {
121
- "h": 7,
122
- "w": 12,
123
- "x": 0,
124
- "y": 7
125
- },
126
- "id": 3,
127
- "options": {
128
- "legend": {
129
- "calcs": [],
130
- "displayMode": "table",
131
- "placement": "bottom"
132
- },
133
- "tooltip": {
134
- "mode": "single"
135
- }
136
- },
137
- "targets": [
138
- {
139
- "expr": "antiatropos_reward_raw{task_id=~\"$task\",mode=~\"$mode\"}",
140
- "legendFormat": "{{task_id}}/{{mode}} raw",
141
- "refId": "A"
142
- }
143
- ],
144
- "title": "Raw Reward",
145
- "type": "timeseries"
146
- },
147
- {
148
- "datasource": {
149
- "type": "prometheus",
150
- "uid": "PBFA97CFB590B2093"
151
- },
152
- "fieldConfig": {
153
- "defaults": {
154
- "color": {
155
- "mode": "palette-classic"
156
- }
157
- },
158
- "overrides": []
159
- },
160
- "gridPos": {
161
- "h": 7,
162
- "w": 12,
163
- "x": 12,
164
- "y": 7
165
- },
166
- "id": 4,
167
- "options": {
168
- "legend": {
169
- "calcs": [],
170
- "displayMode": "table",
171
- "placement": "bottom"
172
- },
173
- "tooltip": {
174
- "mode": "single"
175
- }
176
- },
177
- "targets": [
178
- {
179
- "expr": "antiatropos_total_queue_backlog{task_id=~\"$task\",mode=~\"$mode\"}",
180
- "legendFormat": "{{task_id}}/{{mode}} queue",
181
- "refId": "A"
182
- },
183
- {
184
- "expr": "antiatropos_average_latency_norm{task_id=~\"$task\",mode=~\"$mode\"}",
185
- "legendFormat": "{{task_id}}/{{mode}} latency",
186
- "refId": "B"
187
- }
188
- ],
189
- "title": "Queue Backlog and Latency (Norm)",
190
- "type": "timeseries"
191
- },
192
- {
193
- "datasource": {
194
- "type": "prometheus",
195
- "uid": "PBFA97CFB590B2093"
196
- },
197
- "fieldConfig": {
198
- "defaults": {
199
- "color": {
200
- "mode": "palette-classic"
201
- }
202
- },
203
- "overrides": []
204
- },
205
- "gridPos": {
206
- "h": 7,
207
- "w": 12,
208
- "x": 0,
209
- "y": 14
210
- },
211
- "id": 5,
212
- "options": {
213
- "legend": {
214
- "calcs": [],
215
- "displayMode": "table",
216
- "placement": "bottom"
217
- },
218
- "tooltip": {
219
- "mode": "single"
220
- }
221
- },
222
- "targets": [
223
- {
224
- "expr": "antiatropos_lyapunov_energy{task_id=~\"$task\",mode=~\"$mode\"}",
225
- "legendFormat": "{{task_id}}/{{mode}}",
226
- "refId": "A"
227
- }
228
- ],
229
- "title": "Lyapunov Energy",
230
- "type": "timeseries"
231
- },
232
- {
233
- "datasource": {
234
- "type": "prometheus",
235
- "uid": "PBFA97CFB590B2093"
236
- },
237
- "fieldConfig": {
238
- "defaults": {
239
- "color": {
240
- "mode": "palette-classic"
241
- }
242
- },
243
- "overrides": []
244
- },
245
- "gridPos": {
246
- "h": 7,
247
- "w": 12,
248
- "x": 12,
249
- "y": 14
250
- },
251
- "id": 6,
252
- "options": {
253
- "legend": {
254
- "calcs": [],
255
- "displayMode": "table",
256
- "placement": "bottom"
257
- },
258
- "tooltip": {
259
- "mode": "single"
260
- }
261
- },
262
- "targets": [
263
- {
264
- "expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=~\"$mode\"}[2m])) by (le, mode))",
265
- "legendFormat": "p95 {{mode}}",
266
- "refId": "A"
267
- },
268
- {
269
- "expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=~\"$mode\"}[5m]))",
270
- "legendFormat": "{{mode}} {{error_code}}",
271
- "refId": "B"
272
- }
273
- ],
274
- "title": "Executor Latency p95 and Errors/s",
275
- "type": "timeseries"
276
- }
277
- ],
278
- "refresh": "5s",
279
- "schemaVersion": 39,
280
- "style": "dark",
281
- "tags": [
282
- "antiatropos",
283
- "sre",
284
- "rl"
285
- ],
286
- "templating": {
287
- "list": [
288
- {
289
- "datasource": {
290
- "type": "prometheus",
291
- "uid": "PBFA97CFB590B2093"
292
- },
293
- "definition": "label_values(antiatropos_steps_total, task_id)",
294
- "includeAll": true,
295
- "multi": true,
296
- "name": "task",
297
- "query": {
298
- "qryType": 1,
299
- "query": "label_values(antiatropos_steps_total, task_id)",
300
- "refId": "TaskVar"
301
- },
302
- "refresh": 2,
303
- "type": "query"
304
- },
305
- {
306
- "datasource": {
307
- "type": "prometheus",
308
- "uid": "PBFA97CFB590B2093"
309
- },
310
- "definition": "label_values(antiatropos_steps_total, mode)",
311
- "includeAll": true,
312
- "multi": true,
313
- "name": "mode",
314
- "query": {
315
- "qryType": 1,
316
- "query": "label_values(antiatropos_steps_total, mode)",
317
- "refId": "ModeVar"
318
- },
319
- "refresh": 2,
320
- "type": "query"
321
- }
322
- ]
323
- },
324
- "time": {
325
- "from": "now-15m",
326
- "to": "now"
327
- },
328
- "timepicker": {},
329
- "timezone": "",
330
- "title": "AntiAtropos Live Control Plane",
331
- "uid": "antiatropos-live",
332
- "version": 2,
333
- "weekStart": ""
334
- }
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": {
7
+ "type": "grafana",
8
+ "uid": "-- Grafana --"
9
+ },
10
+ "enable": true,
11
+ "hide": true,
12
+ "iconColor": "rgba(0, 211, 255, 1)",
13
+ "name": "Annotations & Alerts",
14
+ "type": "dashboard"
15
+ }
16
+ ]
17
+ },
18
+ "editable": true,
19
+ "fiscalYearStartMonth": 0,
20
+ "graphTooltip": 0,
21
+ "id": null,
22
+ "links": [],
23
+ "liveNow": false,
24
+ "panels": [
25
+ {
26
+ "datasource": {
27
+ "type": "prometheus",
28
+ "uid": "PBFA97CFB590B2093"
29
+ },
30
+ "fieldConfig": {
31
+ "defaults": {
32
+ "color": {
33
+ "mode": "palette-classic"
34
+ }
35
+ },
36
+ "overrides": []
37
+ },
38
+ "gridPos": {
39
+ "h": 7,
40
+ "w": 12,
41
+ "x": 0,
42
+ "y": 0
43
+ },
44
+ "id": 1,
45
+ "options": {
46
+ "legend": {
47
+ "calcs": [],
48
+ "displayMode": "list",
49
+ "placement": "bottom"
50
+ },
51
+ "tooltip": {
52
+ "mode": "single"
53
+ }
54
+ },
55
+ "targets": [
56
+ {
57
+ "expr": "sum by (action_type, ack_class) (rate(antiatropos_actions_total{task_id=~\"$task\",mode=~\"$mode\"}[5m]))",
58
+ "legendFormat": "{{action_type}} {{ack_class}}",
59
+ "refId": "A"
60
+ }
61
+ ],
62
+ "title": "Actions Per Second",
63
+ "type": "timeseries"
64
+ },
65
+ {
66
+ "datasource": {
67
+ "type": "prometheus",
68
+ "uid": "PBFA97CFB590B2093"
69
+ },
70
+ "fieldConfig": {
71
+ "defaults": {
72
+ "color": {
73
+ "mode": "palette-classic"
74
+ },
75
+ "min": 0,
76
+ "max": 1
77
+ },
78
+ "overrides": []
79
+ },
80
+ "gridPos": {
81
+ "h": 7,
82
+ "w": 12,
83
+ "x": 12,
84
+ "y": 0
85
+ },
86
+ "id": 2,
87
+ "options": {
88
+ "legend": {
89
+ "calcs": [],
90
+ "displayMode": "table",
91
+ "placement": "bottom"
92
+ },
93
+ "tooltip": {
94
+ "mode": "single"
95
+ }
96
+ },
97
+ "targets": [
98
+ {
99
+ "expr": "antiatropos_reward_normalized{task_id=~\"$task\",mode=~\"$mode\"}",
100
+ "legendFormat": "{{task_id}}/{{mode}} normalized",
101
+ "refId": "A"
102
+ }
103
+ ],
104
+ "title": "Normalized Reward [0,1]",
105
+ "type": "timeseries"
106
+ },
107
+ {
108
+ "datasource": {
109
+ "type": "prometheus",
110
+ "uid": "PBFA97CFB590B2093"
111
+ },
112
+ "fieldConfig": {
113
+ "defaults": {
114
+ "color": {
115
+ "mode": "palette-classic"
116
+ }
117
+ },
118
+ "overrides": []
119
+ },
120
+ "gridPos": {
121
+ "h": 7,
122
+ "w": 12,
123
+ "x": 0,
124
+ "y": 7
125
+ },
126
+ "id": 3,
127
+ "options": {
128
+ "legend": {
129
+ "calcs": [],
130
+ "displayMode": "table",
131
+ "placement": "bottom"
132
+ },
133
+ "tooltip": {
134
+ "mode": "single"
135
+ }
136
+ },
137
+ "targets": [
138
+ {
139
+ "expr": "antiatropos_reward_raw{task_id=~\"$task\",mode=~\"$mode\"}",
140
+ "legendFormat": "{{task_id}}/{{mode}} raw",
141
+ "refId": "A"
142
+ }
143
+ ],
144
+ "title": "Raw Reward",
145
+ "type": "timeseries"
146
+ },
147
+ {
148
+ "datasource": {
149
+ "type": "prometheus",
150
+ "uid": "PBFA97CFB590B2093"
151
+ },
152
+ "fieldConfig": {
153
+ "defaults": {
154
+ "color": {
155
+ "mode": "palette-classic"
156
+ }
157
+ },
158
+ "overrides": []
159
+ },
160
+ "gridPos": {
161
+ "h": 7,
162
+ "w": 12,
163
+ "x": 12,
164
+ "y": 7
165
+ },
166
+ "id": 4,
167
+ "options": {
168
+ "legend": {
169
+ "calcs": [],
170
+ "displayMode": "table",
171
+ "placement": "bottom"
172
+ },
173
+ "tooltip": {
174
+ "mode": "single"
175
+ }
176
+ },
177
+ "targets": [
178
+ {
179
+ "expr": "antiatropos_total_queue_backlog{task_id=~\"$task\",mode=~\"$mode\"}",
180
+ "legendFormat": "{{task_id}}/{{mode}} queue",
181
+ "refId": "A"
182
+ },
183
+ {
184
+ "expr": "antiatropos_average_latency_norm{task_id=~\"$task\",mode=~\"$mode\"}",
185
+ "legendFormat": "{{task_id}}/{{mode}} latency",
186
+ "refId": "B"
187
+ }
188
+ ],
189
+ "title": "Queue Backlog and Latency (Norm)",
190
+ "type": "timeseries"
191
+ },
192
+ {
193
+ "datasource": {
194
+ "type": "prometheus",
195
+ "uid": "PBFA97CFB590B2093"
196
+ },
197
+ "fieldConfig": {
198
+ "defaults": {
199
+ "color": {
200
+ "mode": "palette-classic"
201
+ }
202
+ },
203
+ "overrides": []
204
+ },
205
+ "gridPos": {
206
+ "h": 7,
207
+ "w": 12,
208
+ "x": 0,
209
+ "y": 14
210
+ },
211
+ "id": 5,
212
+ "options": {
213
+ "legend": {
214
+ "calcs": [],
215
+ "displayMode": "table",
216
+ "placement": "bottom"
217
+ },
218
+ "tooltip": {
219
+ "mode": "single"
220
+ }
221
+ },
222
+ "targets": [
223
+ {
224
+ "expr": "antiatropos_lyapunov_energy{task_id=~\"$task\",mode=~\"$mode\"}",
225
+ "legendFormat": "{{task_id}}/{{mode}}",
226
+ "refId": "A"
227
+ }
228
+ ],
229
+ "title": "Lyapunov Energy",
230
+ "type": "timeseries"
231
+ },
232
+ {
233
+ "datasource": {
234
+ "type": "prometheus",
235
+ "uid": "PBFA97CFB590B2093"
236
+ },
237
+ "fieldConfig": {
238
+ "defaults": {
239
+ "color": {
240
+ "mode": "palette-classic"
241
+ }
242
+ },
243
+ "overrides": []
244
+ },
245
+ "gridPos": {
246
+ "h": 7,
247
+ "w": 12,
248
+ "x": 12,
249
+ "y": 14
250
+ },
251
+ "id": 6,
252
+ "options": {
253
+ "legend": {
254
+ "calcs": [],
255
+ "displayMode": "table",
256
+ "placement": "bottom"
257
+ },
258
+ "tooltip": {
259
+ "mode": "single"
260
+ }
261
+ },
262
+ "targets": [
263
+ {
264
+ "expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=~\"$mode\"}[5m])) by (le, mode))",
265
+ "legendFormat": "p95 {{mode}}",
266
+ "refId": "A"
267
+ },
268
+ {
269
+ "expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=~\"$mode\"}[5m]))",
270
+ "legendFormat": "{{mode}} {{error_code}}",
271
+ "refId": "B"
272
+ }
273
+ ],
274
+ "title": "Executor Latency p95 and Errors/s",
275
+ "type": "timeseries"
276
+ }
277
+ ],
278
+ "refresh": "5s",
279
+ "schemaVersion": 39,
280
+ "style": "dark",
281
+ "tags": [
282
+ "antiatropos",
283
+ "sre",
284
+ "rl"
285
+ ],
286
+ "templating": {
287
+ "list": [
288
+ {
289
+ "datasource": {
290
+ "type": "prometheus",
291
+ "uid": "PBFA97CFB590B2093"
292
+ },
293
+ "definition": "label_values(antiatropos_steps_total, task_id)",
294
+ "includeAll": true,
295
+ "multi": true,
296
+ "name": "task",
297
+ "query": {
298
+ "qryType": 1,
299
+ "query": "label_values(antiatropos_steps_total, task_id)",
300
+ "refId": "TaskVar"
301
+ },
302
+ "refresh": 2,
303
+ "type": "query"
304
+ },
305
+ {
306
+ "datasource": {
307
+ "type": "prometheus",
308
+ "uid": "PBFA97CFB590B2093"
309
+ },
310
+ "definition": "label_values(antiatropos_steps_total, mode)",
311
+ "includeAll": true,
312
+ "multi": true,
313
+ "name": "mode",
314
+ "query": {
315
+ "qryType": 1,
316
+ "query": "label_values(antiatropos_steps_total, mode)",
317
+ "refId": "ModeVar"
318
+ },
319
+ "refresh": 2,
320
+ "type": "query"
321
+ }
322
+ ]
323
+ },
324
+ "time": {
325
+ "from": "now-15m",
326
+ "to": "now"
327
+ },
328
+ "timepicker": {},
329
+ "timezone": "",
330
+ "title": "AntiAtropos Live Control Plane",
331
+ "uid": "antiatropos-live",
332
+ "version": 2,
333
+ "weekStart": ""
334
+ }
deploy/grafana/provisioning/dashboards/json/antiatropos-overview.json CHANGED
@@ -76,8 +76,8 @@
76
  "targets": [
77
  {
78
  "editorMode": "code",
79
- "expr": "scalar(avg(last_over_time(antiatropos_reward{mode=\"simulated\"}[1m])))",
80
- "legendFormat": "reward (simulated)",
81
  "range": true,
82
  "refId": "A"
83
  }
@@ -143,8 +143,8 @@
143
  "targets": [
144
  {
145
  "editorMode": "code",
146
- "expr": "scalar(avg(last_over_time(antiatropos_total_queue_backlog{mode=\"simulated\"}[1m])))",
147
- "legendFormat": "queue backlog (simulated)",
148
  "range": true,
149
  "refId": "A"
150
  }
@@ -210,8 +210,8 @@
210
  "targets": [
211
  {
212
  "editorMode": "code",
213
- "expr": "scalar(avg(last_over_time(antiatropos_average_latency_norm{mode=\"simulated\"}[1m])))",
214
- "legendFormat": "latency (simulated)",
215
  "range": true,
216
  "refId": "A"
217
  }
@@ -277,8 +277,8 @@
277
  "targets": [
278
  {
279
  "editorMode": "code",
280
- "expr": "scalar(avg(last_over_time(antiatropos_lyapunov_energy{mode=\"simulated\"}[1m])))",
281
- "legendFormat": "lyapunov energy (simulated)",
282
  "range": true,
283
  "refId": "A"
284
  }
@@ -369,14 +369,14 @@
369
  "targets": [
370
  {
371
  "editorMode": "code",
372
- "expr": "antiatropos_reward{mode=\"simulated\"}",
373
  "legendFormat": "reward {{task_id}} ({{mode}})",
374
  "range": true,
375
  "refId": "A"
376
  },
377
  {
378
  "editorMode": "code",
379
- "expr": "antiatropos_lyapunov_energy{mode=\"simulated\"}",
380
  "legendFormat": "lyapunov {{task_id}} ({{mode}})",
381
  "range": true,
382
  "refId": "B"
@@ -468,14 +468,14 @@
468
  "targets": [
469
  {
470
  "editorMode": "code",
471
- "expr": "antiatropos_total_queue_backlog{mode=\"simulated\"}",
472
  "legendFormat": "queue {{task_id}} ({{mode}})",
473
  "range": true,
474
  "refId": "A"
475
  },
476
  {
477
  "editorMode": "code",
478
- "expr": "antiatropos_average_latency_norm{mode=\"simulated\"}",
479
  "legendFormat": "latency {{task_id}} ({{mode}})",
480
  "range": true,
481
  "refId": "B"
@@ -535,14 +535,14 @@
535
  "targets": [
536
  {
537
  "editorMode": "code",
538
- "expr": "sum by (task_id, mode) (rate(antiatropos_steps_total{mode=\"simulated\"}[1m]))",
539
  "legendFormat": "steps/sec {{task_id}} ({{mode}})",
540
  "range": true,
541
  "refId": "A"
542
  },
543
  {
544
  "editorMode": "code",
545
- "expr": "sum by (task_id, mode, action_type) (rate(antiatropos_actions_total{mode=\"simulated\"}[1m]))",
546
  "legendFormat": "actions/sec {{action_type}} ({{task_id}}, {{mode}})",
547
  "range": true,
548
  "refId": "B"
@@ -602,14 +602,14 @@
602
  "targets": [
603
  {
604
  "editorMode": "code",
605
- "expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=\"simulated\"}[5m]))",
606
  "legendFormat": "executor errors {{error_code}} ({{mode}})",
607
  "range": true,
608
  "refId": "A"
609
  },
610
  {
611
  "editorMode": "code",
612
- "expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=\"simulated\"}[5m])) by (le, mode))",
613
  "legendFormat": "p95 executor latency {{mode}}",
614
  "range": true,
615
  "refId": "B"
@@ -640,3 +640,8 @@
640
  "version": 2,
641
  "weekStart": ""
642
  }
 
 
 
 
 
 
76
  "targets": [
77
  {
78
  "editorMode": "code",
79
+ "expr": "scalar(avg(last_over_time(antiatropos_reward{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
80
+ "legendFormat": "reward (all modes)",
81
  "range": true,
82
  "refId": "A"
83
  }
 
143
  "targets": [
144
  {
145
  "editorMode": "code",
146
+ "expr": "scalar(avg(last_over_time(antiatropos_total_queue_backlog{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
147
+ "legendFormat": "queue backlog (all modes)",
148
  "range": true,
149
  "refId": "A"
150
  }
 
210
  "targets": [
211
  {
212
  "editorMode": "code",
213
+ "expr": "scalar(avg(last_over_time(antiatropos_average_latency_norm{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
214
+ "legendFormat": "latency (all modes)",
215
  "range": true,
216
  "refId": "A"
217
  }
 
277
  "targets": [
278
  {
279
  "editorMode": "code",
280
+ "expr": "scalar(avg(last_over_time(antiatropos_lyapunov_energy{mode=~\"live|simulated|hybrid|aws\"}[1m])))",
281
+ "legendFormat": "lyapunov energy (all modes)",
282
  "range": true,
283
  "refId": "A"
284
  }
 
369
  "targets": [
370
  {
371
  "editorMode": "code",
372
+ "expr": "antiatropos_reward{mode=~\"live|simulated|hybrid|aws\"}",
373
  "legendFormat": "reward {{task_id}} ({{mode}})",
374
  "range": true,
375
  "refId": "A"
376
  },
377
  {
378
  "editorMode": "code",
379
+ "expr": "antiatropos_lyapunov_energy{mode=~\"live|simulated|hybrid|aws\"}",
380
  "legendFormat": "lyapunov {{task_id}} ({{mode}})",
381
  "range": true,
382
  "refId": "B"
 
468
  "targets": [
469
  {
470
  "editorMode": "code",
471
+ "expr": "antiatropos_total_queue_backlog{mode=~\"live|simulated|hybrid|aws\"}",
472
  "legendFormat": "queue {{task_id}} ({{mode}})",
473
  "range": true,
474
  "refId": "A"
475
  },
476
  {
477
  "editorMode": "code",
478
+ "expr": "antiatropos_average_latency_norm{mode=~\"live|simulated|hybrid|aws\"}",
479
  "legendFormat": "latency {{task_id}} ({{mode}})",
480
  "range": true,
481
  "refId": "B"
 
535
  "targets": [
536
  {
537
  "editorMode": "code",
538
+ "expr": "sum by (task_id, mode) (rate(antiatropos_steps_total{mode=~\"live|simulated|hybrid|aws\"}[1m]))",
539
  "legendFormat": "steps/sec {{task_id}} ({{mode}})",
540
  "range": true,
541
  "refId": "A"
542
  },
543
  {
544
  "editorMode": "code",
545
+ "expr": "sum by (task_id, mode, action_type) (rate(antiatropos_actions_total{mode=~\"live|simulated|hybrid|aws\"}[1m]))",
546
  "legendFormat": "actions/sec {{action_type}} ({{task_id}}, {{mode}})",
547
  "range": true,
548
  "refId": "B"
 
602
  "targets": [
603
  {
604
  "editorMode": "code",
605
+ "expr": "sum by (mode, error_code) (rate(antiatropos_executor_errors_total{mode=~\"live|simulated|hybrid|aws\"}[5m]))",
606
  "legendFormat": "executor errors {{error_code}} ({{mode}})",
607
  "range": true,
608
  "refId": "A"
609
  },
610
  {
611
  "editorMode": "code",
612
+ "expr": "histogram_quantile(0.95, sum(rate(antiatropos_executor_latency_ms_bucket{mode=~\"live|simulated|hybrid|aws\"}[5m])) by (le, mode))",
613
  "legendFormat": "p95 executor latency {{mode}}",
614
  "range": true,
615
  "refId": "B"
 
640
  "version": 2,
641
  "weekStart": ""
642
  }
643
+
644
+
645
+
646
+
647
+
deploy/grafana/provisioning/dashboards/json/antiatropos-workloads.json ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": {"type": "grafana", "uid": "-- Grafana --"},
7
+ "enable": true,
8
+ "hide": true,
9
+ "iconColor": "rgba(0, 211, 255, 1)",
10
+ "name": "Annotations & Alerts",
11
+ "type": "dashboard"
12
+ }
13
+ ]
14
+ },
15
+ "editable": true,
16
+ "fiscalYearStartMonth": 0,
17
+ "graphTooltip": 1,
18
+ "id": null,
19
+ "links": [],
20
+ "liveNow": false,
21
+ "panels": [
22
+ {
23
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
24
+ "fieldConfig": {
25
+ "defaults": {
26
+ "color": {"mode": "thresholds"},
27
+ "decimals": 1,
28
+ "mappings": [],
29
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 100}, {"color": "red", "value": 500}]},
30
+ "unit": "reqps"
31
+ },
32
+ "overrides": []
33
+ },
34
+ "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
35
+ "id": 1,
36
+ "options": {
37
+ "colorMode": "value",
38
+ "graphMode": "area",
39
+ "justifyMode": "auto",
40
+ "orientation": "auto",
41
+ "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
42
+ "textMode": "auto"
43
+ },
44
+ "targets": [{"expr": "sum(rate(http_requests_total[1m]))", "refId": "A"}],
45
+ "title": "Total Request Rate",
46
+ "type": "stat"
47
+ },
48
+ {
49
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
50
+ "fieldConfig": {
51
+ "defaults": {
52
+ "color": {"mode": "thresholds"},
53
+ "decimals": 3,
54
+ "mappings": [],
55
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 0.01}, {"color": "red", "value": 0.05}]},
56
+ "unit": "percentunit"
57
+ },
58
+ "overrides": []
59
+ },
60
+ "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
61
+ "id": 2,
62
+ "options": {
63
+ "colorMode": "value",
64
+ "graphMode": "area",
65
+ "justifyMode": "auto",
66
+ "orientation": "auto",
67
+ "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
68
+ "textMode": "auto"
69
+ },
70
+ "targets": [{"expr": "sum(rate(http_requests_total{status=~\"5..\"}[1m])) / clamp_min(sum(rate(http_requests_total[1m])), 1)", "refId": "A"}],
71
+ "title": "Global Error Rate",
72
+ "type": "stat"
73
+ },
74
+ {
75
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
76
+ "fieldConfig": {
77
+ "defaults": {
78
+ "color": {"mode": "thresholds"},
79
+ "decimals": 1,
80
+ "mappings": [],
81
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "orange", "value": 50}, {"color": "red", "value": 100}]},
82
+ "unit": "none"
83
+ },
84
+ "overrides": []
85
+ },
86
+ "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
87
+ "id": 3,
88
+ "options": {
89
+ "colorMode": "value",
90
+ "graphMode": "area",
91
+ "justifyMode": "auto",
92
+ "orientation": "auto",
93
+ "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
94
+ "textMode": "auto"
95
+ },
96
+ "targets": [{"expr": "sum(queue_depth)", "refId": "A"}],
97
+ "title": "Total Queue Backlog",
98
+ "type": "stat"
99
+ },
100
+ {
101
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
102
+ "fieldConfig": {
103
+ "defaults": {
104
+ "color": {"mode": "thresholds"},
105
+ "decimals": 1,
106
+ "mappings": [],
107
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "orange", "value": 100}, {"color": "red", "value": 200}]},
108
+ "unit": "ms"
109
+ },
110
+ "overrides": []
111
+ },
112
+ "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
113
+ "id": 4,
114
+ "options": {
115
+ "colorMode": "value",
116
+ "graphMode": "area",
117
+ "justifyMode": "auto",
118
+ "orientation": "auto",
119
+ "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
120
+ "textMode": "auto"
121
+ },
122
+ "targets": [{"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1m])) by (le)) * 1000", "refId": "A"}],
123
+ "title": "Cluster p95 Latency",
124
+ "type": "stat"
125
+ },
126
+ {
127
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
128
+ "fieldConfig": {
129
+ "defaults": {
130
+ "color": {"mode": "palette-classic"},
131
+ "custom": {
132
+ "axisBorderShow": false,
133
+ "axisCenteredZero": false,
134
+ "axisColorMode": "text",
135
+ "axisLabel": "",
136
+ "axisPlacement": "auto",
137
+ "barAlignment": 0,
138
+ "drawStyle": "line",
139
+ "fillOpacity": 10,
140
+ "gradientMode": "none",
141
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
142
+ "insertNulls": false,
143
+ "lineInterpolation": "linear",
144
+ "lineWidth": 2,
145
+ "pointSize": 3,
146
+ "scaleDistribution": {"type": "linear"},
147
+ "showPoints": "auto",
148
+ "spanNulls": false,
149
+ "stacking": {"group": "A", "mode": "none"},
150
+ "thresholdsStyle": {"mode": "off"}
151
+ },
152
+ "mappings": [],
153
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
154
+ "unit": "reqps"
155
+ },
156
+ "overrides": []
157
+ },
158
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
159
+ "id": 10,
160
+ "options": {
161
+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
162
+ "tooltip": {"mode": "multi", "sort": "none"}
163
+ },
164
+ "targets": [
165
+ {
166
+ "expr": "sum(rate(http_requests_total[1m])) by (node_id)",
167
+ "legendFormat": "{{node_id}}",
168
+ "refId": "A"
169
+ }
170
+ ],
171
+ "title": "Request Rate by Node",
172
+ "type": "timeseries"
173
+ },
174
+ {
175
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
176
+ "fieldConfig": {
177
+ "defaults": {
178
+ "color": {"mode": "palette-classic"},
179
+ "custom": {
180
+ "axisBorderShow": false,
181
+ "axisCenteredZero": false,
182
+ "axisColorMode": "text",
183
+ "axisLabel": "",
184
+ "axisPlacement": "auto",
185
+ "barAlignment": 0,
186
+ "drawStyle": "line",
187
+ "fillOpacity": 10,
188
+ "gradientMode": "none",
189
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
190
+ "insertNulls": false,
191
+ "lineInterpolation": "linear",
192
+ "lineWidth": 2,
193
+ "pointSize": 3,
194
+ "scaleDistribution": {"type": "linear"},
195
+ "showPoints": "auto",
196
+ "spanNulls": false,
197
+ "stacking": {"group": "A", "mode": "none"},
198
+ "thresholdsStyle": {"mode": "off"}
199
+ },
200
+ "mappings": [],
201
+ "min": 0,
202
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
203
+ "unit": "percentunit"
204
+ },
205
+ "overrides": []
206
+ },
207
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
208
+ "id": 11,
209
+ "options": {
210
+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
211
+ "tooltip": {"mode": "multi", "sort": "none"}
212
+ },
213
+ "targets": [
214
+ {
215
+ "expr": "sum(rate(http_requests_total{status=~\"5..\"}[1m])) by (node_id) / clamp_min(sum(rate(http_requests_total[1m])) by (node_id), 1)",
216
+ "legendFormat": "{{node_id}}",
217
+ "refId": "A"
218
+ }
219
+ ],
220
+ "title": "Error Rate by Node",
221
+ "type": "timeseries"
222
+ },
223
+ {
224
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
225
+ "fieldConfig": {
226
+ "defaults": {
227
+ "color": {"mode": "palette-classic"},
228
+ "custom": {
229
+ "axisBorderShow": false,
230
+ "axisCenteredZero": false,
231
+ "axisColorMode": "text",
232
+ "axisLabel": "",
233
+ "axisPlacement": "auto",
234
+ "barAlignment": 0,
235
+ "drawStyle": "line",
236
+ "fillOpacity": 10,
237
+ "gradientMode": "none",
238
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
239
+ "insertNulls": false,
240
+ "lineInterpolation": "linear",
241
+ "lineWidth": 2,
242
+ "pointSize": 3,
243
+ "scaleDistribution": {"type": "linear"},
244
+ "showPoints": "auto",
245
+ "spanNulls": false,
246
+ "stacking": {"group": "A", "mode": "none"},
247
+ "thresholdsStyle": {"mode": "off"}
248
+ },
249
+ "mappings": [],
250
+ "min": 0,
251
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
252
+ "unit": "none"
253
+ },
254
+ "overrides": []
255
+ },
256
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
257
+ "id": 12,
258
+ "options": {
259
+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
260
+ "tooltip": {"mode": "multi", "sort": "none"}
261
+ },
262
+ "targets": [
263
+ {
264
+ "expr": "avg(queue_depth) by (node_id)",
265
+ "legendFormat": "{{node_id}}",
266
+ "refId": "A"
267
+ }
268
+ ],
269
+ "title": "Queue Depth by Node",
270
+ "type": "timeseries"
271
+ },
272
+ {
273
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
274
+ "fieldConfig": {
275
+ "defaults": {
276
+ "color": {"mode": "palette-classic"},
277
+ "custom": {
278
+ "axisBorderShow": false,
279
+ "axisCenteredZero": false,
280
+ "axisColorMode": "text",
281
+ "axisLabel": "",
282
+ "axisPlacement": "auto",
283
+ "barAlignment": 0,
284
+ "drawStyle": "line",
285
+ "fillOpacity": 10,
286
+ "gradientMode": "none",
287
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
288
+ "insertNulls": false,
289
+ "lineInterpolation": "linear",
290
+ "lineWidth": 2,
291
+ "pointSize": 3,
292
+ "scaleDistribution": {"type": "linear"},
293
+ "showPoints": "auto",
294
+ "spanNulls": false,
295
+ "stacking": {"group": "A", "mode": "none"},
296
+ "thresholdsStyle": {"mode": "off"}
297
+ },
298
+ "mappings": [],
299
+ "min": 0,
300
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
301
+ "unit": "ms"
302
+ },
303
+ "overrides": []
304
+ },
305
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
306
+ "id": 13,
307
+ "options": {
308
+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
309
+ "tooltip": {"mode": "multi", "sort": "none"}
310
+ },
311
+ "targets": [
312
+ {
313
+ "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (node_id, le)) * 1000",
314
+ "legendFormat": "{{node_id}}",
315
+ "refId": "A"
316
+ }
317
+ ],
318
+ "title": "Latency p95 by Node",
319
+ "type": "timeseries"
320
+ },
321
+ {
322
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
323
+ "fieldConfig": {
324
+ "defaults": {
325
+ "color": {"mode": "palette-classic"},
326
+ "custom": {
327
+ "axisBorderShow": false,
328
+ "axisCenteredZero": false,
329
+ "axisColorMode": "text",
330
+ "axisLabel": "",
331
+ "axisPlacement": "auto",
332
+ "barAlignment": 0,
333
+ "drawStyle": "line",
334
+ "fillOpacity": 10,
335
+ "gradientMode": "none",
336
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
337
+ "insertNulls": false,
338
+ "lineInterpolation": "linear",
339
+ "lineWidth": 2,
340
+ "pointSize": 3,
341
+ "scaleDistribution": {"type": "linear"},
342
+ "showPoints": "auto",
343
+ "spanNulls": false,
344
+ "stacking": {"group": "A", "mode": "none"},
345
+ "thresholdsStyle": {"mode": "off"}
346
+ },
347
+ "mappings": [],
348
+ "min": 0,
349
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
350
+ "unit": "percentunit"
351
+ },
352
+ "overrides": []
353
+ },
354
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
355
+ "id": 14,
356
+ "options": {
357
+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
358
+ "tooltip": {"mode": "multi", "sort": "none"}
359
+ },
360
+ "targets": [
361
+ {
362
+ "expr": "avg(rate(container_cpu_usage_seconds_total[1m])) by (node_id)",
363
+ "legendFormat": "{{node_id}}",
364
+ "refId": "A"
365
+ }
366
+ ],
367
+ "title": "CPU by Node",
368
+ "type": "timeseries"
369
+ },
370
+ {
371
+ "datasource": {"type": "prometheus", "uid": "PBFA97CFB590B2093"},
372
+ "fieldConfig": {
373
+ "defaults": {
374
+ "color": {"mode": "palette-classic"},
375
+ "custom": {
376
+ "axisBorderShow": false,
377
+ "axisCenteredZero": false,
378
+ "axisColorMode": "text",
379
+ "axisLabel": "",
380
+ "axisPlacement": "auto",
381
+ "barAlignment": 0,
382
+ "drawStyle": "line",
383
+ "fillOpacity": 10,
384
+ "gradientMode": "none",
385
+ "hideFrom": {"legend": false, "tooltip": false, "viz": false},
386
+ "insertNulls": false,
387
+ "lineInterpolation": "linear",
388
+ "lineWidth": 2,
389
+ "pointSize": 3,
390
+ "scaleDistribution": {"type": "linear"},
391
+ "showPoints": "auto",
392
+ "spanNulls": false,
393
+ "stacking": {"group": "A", "mode": "none"},
394
+ "thresholdsStyle": {"mode": "off"}
395
+ },
396
+ "mappings": [],
397
+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
398
+ "unit": "reqps"
399
+ },
400
+ "overrides": []
401
+ },
402
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
403
+ "id": 15,
404
+ "options": {
405
+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
406
+ "tooltip": {"mode": "multi", "sort": "none"}
407
+ },
408
+ "targets": [
409
+ {
410
+ "expr": "sum(rate(http_requests_total{status=\"200\"}[1m])) by (node_id)",
411
+ "legendFormat": "200 {{node_id}}",
412
+ "refId": "A"
413
+ },
414
+ {
415
+ "expr": "sum(rate(http_requests_total{status=\"500\"}[1m])) by (node_id)",
416
+ "legendFormat": "500 {{node_id}}",
417
+ "refId": "B"
418
+ }
419
+ ],
420
+ "title": "Requests by Status Code",
421
+ "type": "timeseries"
422
+ }
423
+ ],
424
+ "refresh": "5s",
425
+ "schemaVersion": 41,
426
+ "style": "dark",
427
+ "tags": ["antiatropos", "sre", "workload"],
428
+ "templating": {"list": []},
429
+ "time": {"from": "now-15m", "to": "now"},
430
+ "timepicker": {},
431
+ "timezone": "browser",
432
+ "title": "AntiAtropos Workloads",
433
+ "uid": "antiatropos-workloads",
434
+ "version": 1,
435
+ "weekStart": ""
436
+ }
deploy/grafana/provisioning/datasources/prometheus.yaml CHANGED
@@ -5,6 +5,6 @@ datasources:
5
  uid: PBFA97CFB590B2093
6
  type: prometheus
7
  access: proxy
8
- url: http://127.0.0.1:9090/prometheus
9
  isDefault: true
10
- editable: false
 
5
  uid: PBFA97CFB590B2093
6
  type: prometheus
7
  access: proxy
8
+ url: http://127.0.0.1:9090
9
  isDefault: true
10
+ editable: true
deploy/index.html CHANGED
@@ -1,473 +1,473 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
- <meta charset="UTF-8">
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <title>AntiAtropos Control Console</title>
7
- <style>
8
- :root {
9
- --bg: #0b1220;
10
- --bg-soft: #101a2d;
11
- --panel: #111d33;
12
- --line: #2b3d5d;
13
- --text: #e6edf8;
14
- --muted: #9bb0cf;
15
- --accent: #ff5a3d;
16
- --accent-strong: #e14830;
17
- --ok: #3dcf8e;
18
- --bad: #ff6f7f;
19
- }
20
-
21
- * {
22
- box-sizing: border-box;
23
- }
24
-
25
- body {
26
- margin: 0;
27
- padding: 24px;
28
- background:
29
- radial-gradient(circle at top right, rgba(255, 90, 61, 0.18), transparent 40%),
30
- radial-gradient(circle at top left, rgba(74, 140, 255, 0.18), transparent 35%),
31
- var(--bg);
32
- color: var(--text);
33
- font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif;
34
- }
35
-
36
- .shell {
37
- max-width: 1440px;
38
- margin: 0 auto;
39
- display: grid;
40
- gap: 18px;
41
- }
42
-
43
- .card {
44
- background: linear-gradient(180deg, rgba(17, 29, 51, 0.88), rgba(15, 25, 44, 0.92));
45
- border: 1px solid var(--line);
46
- border-radius: 16px;
47
- }
48
-
49
- .header {
50
- padding: 20px 22px;
51
- display: flex;
52
- justify-content: space-between;
53
- align-items: center;
54
- gap: 16px;
55
- flex-wrap: wrap;
56
- }
57
-
58
- .title h1 {
59
- margin: 0;
60
- font-size: 1.5rem;
61
- letter-spacing: 0.01em;
62
- }
63
-
64
- .title p {
65
- margin: 4px 0 0;
66
- color: var(--muted);
67
- font-size: 0.95rem;
68
- }
69
-
70
- .links {
71
- display: flex;
72
- gap: 10px;
73
- flex-wrap: wrap;
74
- }
75
-
76
- .link-btn {
77
- display: inline-flex;
78
- align-items: center;
79
- justify-content: center;
80
- height: 38px;
81
- padding: 0 14px;
82
- border-radius: 10px;
83
- border: 1px solid var(--line);
84
- color: var(--text);
85
- text-decoration: none;
86
- background: var(--bg-soft);
87
- font-size: 0.9rem;
88
- }
89
-
90
- .layout {
91
- display: grid;
92
- grid-template-columns: 1fr;
93
- gap: 18px;
94
- }
95
-
96
- .controls {
97
- padding: 16px;
98
- display: grid;
99
- grid-template-columns: 1fr;
100
- gap: 14px;
101
- }
102
-
103
- .controls-grid {
104
- display: grid;
105
- grid-template-columns: repeat(4, minmax(0, 1fr));
106
- gap: 12px;
107
- align-items: end;
108
- }
109
-
110
- .field label {
111
- display: block;
112
- color: var(--muted);
113
- font-size: 0.78rem;
114
- font-weight: 600;
115
- letter-spacing: 0.04em;
116
- margin-bottom: 6px;
117
- text-transform: uppercase;
118
- }
119
-
120
- .field select,
121
- .field input {
122
- width: 100%;
123
- height: 44px;
124
- border-radius: 10px;
125
- border: 1px solid var(--line);
126
- background: #0c162a;
127
- color: var(--text);
128
- padding: 0 12px;
129
- font-size: 0.95rem;
130
- }
131
-
132
- .actions {
133
- display: grid;
134
- grid-template-columns: 180px 1fr;
135
- gap: 10px;
136
- }
137
-
138
- .btn {
139
- border: 1px solid var(--line);
140
- border-radius: 10px;
141
- height: 44px;
142
- cursor: pointer;
143
- font-weight: 600;
144
- font-size: 0.95rem;
145
- color: var(--text);
146
- background: var(--bg-soft);
147
- }
148
-
149
- .btn-primary {
150
- background: linear-gradient(135deg, var(--accent), var(--accent-strong));
151
- border-color: transparent;
152
- color: #fff;
153
- }
154
-
155
- .metrics {
156
- padding: 16px;
157
- display: grid;
158
- grid-template-columns: repeat(5, minmax(0, 1fr));
159
- gap: 10px;
160
- }
161
-
162
- .metric {
163
- background: #0d172a;
164
- border: 1px solid var(--line);
165
- border-radius: 12px;
166
- padding: 12px;
167
- min-height: 86px;
168
- }
169
-
170
- .metric .name {
171
- color: var(--muted);
172
- font-size: 0.78rem;
173
- text-transform: uppercase;
174
- letter-spacing: 0.05em;
175
- margin-bottom: 8px;
176
- }
177
-
178
- .metric .value {
179
- font-family: Consolas, "SFMono-Regular", Menlo, monospace;
180
- font-size: 1.18rem;
181
- font-weight: 700;
182
- color: var(--text);
183
- }
184
-
185
- .metric .value.good {
186
- color: var(--ok);
187
- }
188
-
189
- .metric .value.bad {
190
- color: var(--bad);
191
- }
192
-
193
- .monitor {
194
- padding: 16px;
195
- display: grid;
196
- gap: 10px;
197
- }
198
-
199
- .monitor-head {
200
- display: flex;
201
- justify-content: space-between;
202
- align-items: center;
203
- gap: 12px;
204
- flex-wrap: wrap;
205
- }
206
-
207
- .monitor-head h2 {
208
- margin: 0;
209
- font-size: 1.05rem;
210
- font-weight: 700;
211
- }
212
-
213
- .monitor-head p {
214
- margin: 0;
215
- color: var(--muted);
216
- font-size: 0.85rem;
217
- }
218
-
219
- .graph-wrap {
220
- height: 920px;
221
- border: 1px solid var(--line);
222
- border-radius: 12px;
223
- overflow: hidden;
224
- background: #0a1324;
225
- }
226
-
227
- iframe {
228
- width: 100%;
229
- height: 100%;
230
- border: 0;
231
- }
232
-
233
- .logs {
234
- padding: 16px;
235
- }
236
-
237
- .logs h3 {
238
- margin: 0 0 10px;
239
- font-size: 0.9rem;
240
- color: var(--muted);
241
- text-transform: uppercase;
242
- letter-spacing: 0.05em;
243
- }
244
-
245
- #terminal {
246
- background: #091121;
247
- border: 1px solid var(--line);
248
- border-radius: 10px;
249
- height: 160px;
250
- overflow-y: auto;
251
- padding: 10px;
252
- font-family: Consolas, "SFMono-Regular", Menlo, monospace;
253
- font-size: 0.83rem;
254
- color: #c9d6ed;
255
- }
256
-
257
- .log-line {
258
- padding: 2px 0;
259
- border-bottom: 1px solid rgba(155, 176, 207, 0.08);
260
- }
261
-
262
- .log-time {
263
- color: #7084a8;
264
- margin-right: 8px;
265
- font-size: 0.72rem;
266
- }
267
-
268
- @media (max-width: 1120px) {
269
- .controls-grid {
270
- grid-template-columns: 1fr 1fr;
271
- }
272
-
273
- .actions {
274
- grid-template-columns: 1fr;
275
- }
276
-
277
- .metrics {
278
- grid-template-columns: 1fr 1fr;
279
- }
280
- }
281
-
282
- @media (max-width: 680px) {
283
- body {
284
- padding: 12px;
285
- }
286
-
287
- .controls-grid,
288
- .metrics {
289
- grid-template-columns: 1fr;
290
- }
291
-
292
- .graph-wrap {
293
- height: 760px;
294
- }
295
- }
296
- </style>
297
- </head>
298
- <body>
299
- <div class="shell">
300
- <header class="card header">
301
- <div class="title">
302
- <h1>AntiAtropos SRE Control Console</h1>
303
- <p>Simulated environment with direct observability through Prometheus and Grafana</p>
304
- </div>
305
- <div class="links">
306
- <a class="link-btn" href="/docs" target="_blank">API Docs</a>
307
- <a class="link-btn" href="/prometheus/" target="_blank">Open Prometheus</a>
308
- <a class="link-btn" href="/grafana/" target="_blank">Open Grafana</a>
309
- </div>
310
- </header>
311
-
312
- <main class="layout">
313
- <section class="card controls">
314
- <div class="controls-grid">
315
- <div class="field">
316
- <label for="action-type">Action Type</label>
317
- <select id="action-type">
318
- <option value="NO_OP">NO_OP</option>
319
- <option value="SCALE_UP">SCALE_UP</option>
320
- <option value="SCALE_DOWN">SCALE_DOWN</option>
321
- <option value="REROUTE_TRAFFIC">REROUTE_TRAFFIC</option>
322
- <option value="SHED_LOAD">SHED_LOAD</option>
323
- </select>
324
- </div>
325
- <div class="field">
326
- <label for="node-id">Target Node</label>
327
- <select id="node-id">
328
- <option value="node-0">node-0 (VIP)</option>
329
- <option value="node-1">node-1</option>
330
- <option value="node-2">node-2</option>
331
- <option value="node-3">node-3</option>
332
- <option value="node-4">node-4</option>
333
- </select>
334
- </div>
335
- <div class="field">
336
- <label for="parameter">Parameter</label>
337
- <input id="parameter" type="number" step="0.1" value="0.0">
338
- </div>
339
- <div class="actions">
340
- <button class="btn btn-primary" onclick="resetEnv()">Reset Episode</button>
341
- <button class="btn" onclick="stepEnv()">Execute Step</button>
342
- </div>
343
- </div>
344
- </section>
345
-
346
- <section class="card metrics">
347
- <div class="metric">
348
- <div class="name">Cluster ID</div>
349
- <div id="cluster-id" class="value">---</div>
350
- </div>
351
- <div class="metric">
352
- <div class="name">Reward</div>
353
- <div id="last-reward" class="value">0.0000</div>
354
- </div>
355
- <div class="metric">
356
- <div class="name">Lyapunov Energy</div>
357
- <div id="lyapunov-val" class="value">0.0000</div>
358
- </div>
359
- <div class="metric">
360
- <div class="name">Mode</div>
361
- <div id="mode-val" class="value">simulated</div>
362
- </div>
363
- <div class="metric">
364
- <div class="name">Step</div>
365
- <div id="step-val" class="value">0</div>
366
- </div>
367
- </section>
368
-
369
- <section class="card monitor">
370
- <div class="monitor-head">
371
- <h2>Required Graphs</h2>
372
- <p>Raw metrics source: Prometheus. Curated dashboard: Grafana.</p>
373
- </div>
374
- <div class="graph-wrap">
375
- <iframe
376
- id="grafana-iframe"
377
- src="/grafana/d/antiatropos-overview/antiatropos-overview?kiosk&theme=dark&refresh=5s&from=now-30m&to=now">
378
- </iframe>
379
- </div>
380
- </section>
381
-
382
- <section class="card logs">
383
- <h3>System Logs</h3>
384
- <div id="terminal">
385
- <div class="log-line"><span class="log-time">[init]</span>Waiting for interaction.</div>
386
- </div>
387
- </section>
388
- </main>
389
- </div>
390
-
391
- <script>
392
- const terminal = document.getElementById("terminal");
393
-
394
- function log(message, type = "info") {
395
- const time = new Date().toLocaleTimeString([], {
396
- hour12: false,
397
- hour: "2-digit",
398
- minute: "2-digit",
399
- second: "2-digit"
400
- });
401
- const row = document.createElement("div");
402
- row.className = "log-line";
403
- const color = type === "error" ? "#ff6f7f" : type === "success" ? "#3dcf8e" : "#c9d6ed";
404
- row.innerHTML = '<span class="log-time">[' + time + "]</span><span style=\"color:" + color + "\">" + message + "</span>";
405
- terminal.appendChild(row);
406
- terminal.scrollTop = terminal.scrollHeight;
407
- }
408
-
409
- function updateUI(data) {
410
- const observation = data.observation || {};
411
- const rewardNode = document.getElementById("last-reward");
412
- const reward = typeof data.reward === "number" ? data.reward : 0;
413
-
414
- document.getElementById("cluster-id").innerText = (observation.cluster_id || "---").toString().slice(0, 12);
415
- document.getElementById("lyapunov-val").innerText = Number(observation.lyapunov_energy || 0).toFixed(4);
416
- document.getElementById("mode-val").innerText = (observation.mode || "simulated").toString();
417
- document.getElementById("step-val").innerText = String(observation.step || 0);
418
-
419
- rewardNode.innerText = reward.toFixed(4);
420
- rewardNode.className = reward < 0 ? "value bad" : "value good";
421
- }
422
-
423
- async function resetEnv() {
424
- log("Resetting environment...");
425
- try {
426
- const response = await fetch("/reset", {
427
- method: "POST",
428
- headers: { "Content-Type": "application/json" },
429
- body: JSON.stringify({})
430
- });
431
- const data = await response.json();
432
- updateUI(data);
433
- log("Environment reset complete.", "success");
434
- } catch (err) {
435
- log("Reset failed: " + err.message, "error");
436
- }
437
- }
438
-
439
- async function stepEnv() {
440
- const action = {
441
- action_type: document.getElementById("action-type").value,
442
- target_node_id: document.getElementById("node-id").value,
443
- parameter: parseFloat(document.getElementById("parameter").value)
444
- };
445
-
446
- log("Dispatching " + action.action_type + " to " + action.target_node_id + " (" + action.parameter + ")");
447
-
448
- try {
449
- const response = await fetch("/step", {
450
- method: "POST",
451
- headers: { "Content-Type": "application/json" },
452
- body: JSON.stringify({ action: action })
453
- });
454
- const data = await response.json();
455
-
456
- if (data.detail) {
457
- log("Invalid payload: " + JSON.stringify(data.detail), "error");
458
- return;
459
- }
460
-
461
- updateUI(data);
462
- log(
463
- "Step complete. Reward=" + Number(data.reward || 0).toFixed(3) +
464
- " Lyapunov=" + Number((data.observation || {}).lyapunov_energy || 0).toFixed(3),
465
- "success"
466
- );
467
- } catch (err) {
468
- log("Execution failed: " + err.message, "error");
469
- }
470
- }
471
- </script>
472
- </body>
473
- </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>AntiAtropos Control Console</title>
7
+ <style>
8
+ :root {
9
+ --bg: #0b1220;
10
+ --bg-soft: #101a2d;
11
+ --panel: #111d33;
12
+ --line: #2b3d5d;
13
+ --text: #e6edf8;
14
+ --muted: #9bb0cf;
15
+ --accent: #ff5a3d;
16
+ --accent-strong: #e14830;
17
+ --ok: #3dcf8e;
18
+ --bad: #ff6f7f;
19
+ }
20
+
21
+ * {
22
+ box-sizing: border-box;
23
+ }
24
+
25
+ body {
26
+ margin: 0;
27
+ padding: 24px;
28
+ background:
29
+ radial-gradient(circle at top right, rgba(255, 90, 61, 0.18), transparent 40%),
30
+ radial-gradient(circle at top left, rgba(74, 140, 255, 0.18), transparent 35%),
31
+ var(--bg);
32
+ color: var(--text);
33
+ font-family: "Segoe UI", "Helvetica Neue", Arial, sans-serif;
34
+ }
35
+
36
+ .shell {
37
+ max-width: 1440px;
38
+ margin: 0 auto;
39
+ display: grid;
40
+ gap: 18px;
41
+ }
42
+
43
+ .card {
44
+ background: linear-gradient(180deg, rgba(17, 29, 51, 0.88), rgba(15, 25, 44, 0.92));
45
+ border: 1px solid var(--line);
46
+ border-radius: 16px;
47
+ }
48
+
49
+ .header {
50
+ padding: 20px 22px;
51
+ display: flex;
52
+ justify-content: space-between;
53
+ align-items: center;
54
+ gap: 16px;
55
+ flex-wrap: wrap;
56
+ }
57
+
58
+ .title h1 {
59
+ margin: 0;
60
+ font-size: 1.5rem;
61
+ letter-spacing: 0.01em;
62
+ }
63
+
64
+ .title p {
65
+ margin: 4px 0 0;
66
+ color: var(--muted);
67
+ font-size: 0.95rem;
68
+ }
69
+
70
+ .links {
71
+ display: flex;
72
+ gap: 10px;
73
+ flex-wrap: wrap;
74
+ }
75
+
76
+ .link-btn {
77
+ display: inline-flex;
78
+ align-items: center;
79
+ justify-content: center;
80
+ height: 38px;
81
+ padding: 0 14px;
82
+ border-radius: 10px;
83
+ border: 1px solid var(--line);
84
+ color: var(--text);
85
+ text-decoration: none;
86
+ background: var(--bg-soft);
87
+ font-size: 0.9rem;
88
+ }
89
+
90
+ .layout {
91
+ display: grid;
92
+ grid-template-columns: 1fr;
93
+ gap: 18px;
94
+ }
95
+
96
+ .controls {
97
+ padding: 16px;
98
+ display: grid;
99
+ grid-template-columns: 1fr;
100
+ gap: 14px;
101
+ }
102
+
103
+ .controls-grid {
104
+ display: grid;
105
+ grid-template-columns: repeat(4, minmax(0, 1fr));
106
+ gap: 12px;
107
+ align-items: end;
108
+ }
109
+
110
+ .field label {
111
+ display: block;
112
+ color: var(--muted);
113
+ font-size: 0.78rem;
114
+ font-weight: 600;
115
+ letter-spacing: 0.04em;
116
+ margin-bottom: 6px;
117
+ text-transform: uppercase;
118
+ }
119
+
120
+ .field select,
121
+ .field input {
122
+ width: 100%;
123
+ height: 44px;
124
+ border-radius: 10px;
125
+ border: 1px solid var(--line);
126
+ background: #0c162a;
127
+ color: var(--text);
128
+ padding: 0 12px;
129
+ font-size: 0.95rem;
130
+ }
131
+
132
+ .actions {
133
+ display: grid;
134
+ grid-template-columns: 180px 1fr;
135
+ gap: 10px;
136
+ }
137
+
138
+ .btn {
139
+ border: 1px solid var(--line);
140
+ border-radius: 10px;
141
+ height: 44px;
142
+ cursor: pointer;
143
+ font-weight: 600;
144
+ font-size: 0.95rem;
145
+ color: var(--text);
146
+ background: var(--bg-soft);
147
+ }
148
+
149
+ .btn-primary {
150
+ background: linear-gradient(135deg, var(--accent), var(--accent-strong));
151
+ border-color: transparent;
152
+ color: #fff;
153
+ }
154
+
155
+ .metrics {
156
+ padding: 16px;
157
+ display: grid;
158
+ grid-template-columns: repeat(5, minmax(0, 1fr));
159
+ gap: 10px;
160
+ }
161
+
162
+ .metric {
163
+ background: #0d172a;
164
+ border: 1px solid var(--line);
165
+ border-radius: 12px;
166
+ padding: 12px;
167
+ min-height: 86px;
168
+ }
169
+
170
+ .metric .name {
171
+ color: var(--muted);
172
+ font-size: 0.78rem;
173
+ text-transform: uppercase;
174
+ letter-spacing: 0.05em;
175
+ margin-bottom: 8px;
176
+ }
177
+
178
+ .metric .value {
179
+ font-family: Consolas, "SFMono-Regular", Menlo, monospace;
180
+ font-size: 1.18rem;
181
+ font-weight: 700;
182
+ color: var(--text);
183
+ }
184
+
185
+ .metric .value.good {
186
+ color: var(--ok);
187
+ }
188
+
189
+ .metric .value.bad {
190
+ color: var(--bad);
191
+ }
192
+
193
+ .monitor {
194
+ padding: 16px;
195
+ display: grid;
196
+ gap: 10px;
197
+ }
198
+
199
+ .monitor-head {
200
+ display: flex;
201
+ justify-content: space-between;
202
+ align-items: center;
203
+ gap: 12px;
204
+ flex-wrap: wrap;
205
+ }
206
+
207
+ .monitor-head h2 {
208
+ margin: 0;
209
+ font-size: 1.05rem;
210
+ font-weight: 700;
211
+ }
212
+
213
+ .monitor-head p {
214
+ margin: 0;
215
+ color: var(--muted);
216
+ font-size: 0.85rem;
217
+ }
218
+
219
+ .graph-wrap {
220
+ height: 920px;
221
+ border: 1px solid var(--line);
222
+ border-radius: 12px;
223
+ overflow: hidden;
224
+ background: #0a1324;
225
+ }
226
+
227
+ iframe {
228
+ width: 100%;
229
+ height: 100%;
230
+ border: 0;
231
+ }
232
+
233
+ .logs {
234
+ padding: 16px;
235
+ }
236
+
237
+ .logs h3 {
238
+ margin: 0 0 10px;
239
+ font-size: 0.9rem;
240
+ color: var(--muted);
241
+ text-transform: uppercase;
242
+ letter-spacing: 0.05em;
243
+ }
244
+
245
+ #terminal {
246
+ background: #091121;
247
+ border: 1px solid var(--line);
248
+ border-radius: 10px;
249
+ height: 160px;
250
+ overflow-y: auto;
251
+ padding: 10px;
252
+ font-family: Consolas, "SFMono-Regular", Menlo, monospace;
253
+ font-size: 0.83rem;
254
+ color: #c9d6ed;
255
+ }
256
+
257
+ .log-line {
258
+ padding: 2px 0;
259
+ border-bottom: 1px solid rgba(155, 176, 207, 0.08);
260
+ }
261
+
262
+ .log-time {
263
+ color: #7084a8;
264
+ margin-right: 8px;
265
+ font-size: 0.72rem;
266
+ }
267
+
268
+ @media (max-width: 1120px) {
269
+ .controls-grid {
270
+ grid-template-columns: 1fr 1fr;
271
+ }
272
+
273
+ .actions {
274
+ grid-template-columns: 1fr;
275
+ }
276
+
277
+ .metrics {
278
+ grid-template-columns: 1fr 1fr;
279
+ }
280
+ }
281
+
282
+ @media (max-width: 680px) {
283
+ body {
284
+ padding: 12px;
285
+ }
286
+
287
+ .controls-grid,
288
+ .metrics {
289
+ grid-template-columns: 1fr;
290
+ }
291
+
292
+ .graph-wrap {
293
+ height: 760px;
294
+ }
295
+ }
296
+ </style>
297
+ </head>
298
+ <body>
299
+ <div class="shell">
300
+ <header class="card header">
301
+ <div class="title">
302
+ <h1>AntiAtropos SRE Control Console</h1>
303
+ <p>Simulated environment with direct observability through Prometheus and Grafana</p>
304
+ </div>
305
+ <div class="links">
306
+ <a class="link-btn" href="/docs" target="_blank">API Docs</a>
307
+ <a class="link-btn" href="/prometheus/" target="_blank">Open Prometheus</a>
308
+ <a class="link-btn" href="/grafana/" target="_blank">Open Grafana</a>
309
+ </div>
310
+ </header>
311
+
312
+ <main class="layout">
313
+ <section class="card controls">
314
+ <div class="controls-grid">
315
+ <div class="field">
316
+ <label for="action-type">Action Type</label>
317
+ <select id="action-type">
318
+ <option value="NO_OP">NO_OP</option>
319
+ <option value="SCALE_UP">SCALE_UP</option>
320
+ <option value="SCALE_DOWN">SCALE_DOWN</option>
321
+ <option value="REROUTE_TRAFFIC">REROUTE_TRAFFIC</option>
322
+ <option value="SHED_LOAD">SHED_LOAD</option>
323
+ </select>
324
+ </div>
325
+ <div class="field">
326
+ <label for="node-id">Target Node</label>
327
+ <select id="node-id">
328
+ <option value="node-0">node-0 (VIP)</option>
329
+ <option value="node-1">node-1</option>
330
+ <option value="node-2">node-2</option>
331
+ <option value="node-3">node-3</option>
332
+ <option value="node-4">node-4</option>
333
+ </select>
334
+ </div>
335
+ <div class="field">
336
+ <label for="parameter">Parameter</label>
337
+ <input id="parameter" type="number" step="0.1" value="0.0">
338
+ </div>
339
+ <div class="actions">
340
+ <button class="btn btn-primary" onclick="resetEnv()">Reset Episode</button>
341
+ <button class="btn" onclick="stepEnv()">Execute Step</button>
342
+ </div>
343
+ </div>
344
+ </section>
345
+
346
+ <section class="card metrics">
347
+ <div class="metric">
348
+ <div class="name">Cluster ID</div>
349
+ <div id="cluster-id" class="value">---</div>
350
+ </div>
351
+ <div class="metric">
352
+ <div class="name">Reward</div>
353
+ <div id="last-reward" class="value">0.0000</div>
354
+ </div>
355
+ <div class="metric">
356
+ <div class="name">Lyapunov Energy</div>
357
+ <div id="lyapunov-val" class="value">0.0000</div>
358
+ </div>
359
+ <div class="metric">
360
+ <div class="name">Mode</div>
361
+ <div id="mode-val" class="value">simulated</div>
362
+ </div>
363
+ <div class="metric">
364
+ <div class="name">Step</div>
365
+ <div id="step-val" class="value">0</div>
366
+ </div>
367
+ </section>
368
+
369
+ <section class="card monitor">
370
+ <div class="monitor-head">
371
+ <h2>Required Graphs</h2>
372
+ <p>Raw metrics source: Prometheus. Curated dashboard: Grafana.</p>
373
+ </div>
374
+ <div class="graph-wrap">
375
+ <iframe
376
+ id="grafana-iframe"
377
+ src="/grafana/d/antiatropos-overview/antiatropos-overview?kiosk&theme=dark&refresh=5s&from=now-30m&to=now">
378
+ </iframe>
379
+ </div>
380
+ </section>
381
+
382
+ <section class="card logs">
383
+ <h3>System Logs</h3>
384
+ <div id="terminal">
385
+ <div class="log-line"><span class="log-time">[init]</span>Waiting for interaction.</div>
386
+ </div>
387
+ </section>
388
+ </main>
389
+ </div>
390
+
391
+ <script>
392
+ const terminal = document.getElementById("terminal");
393
+
394
/**
 * Append a timestamped entry to the "System Logs" panel and scroll it into view.
 *
 * The message is inserted as plain text and is never parsed as HTML. Callers
 * pass strings that originate from server responses (exception messages,
 * JSON-encoded `detail` payloads), so concatenating them into `innerHTML`
 * would let any markup in those strings be interpreted by the browser.
 *
 * Relies on the script-level `terminal` element (looked up once via
 * `document.getElementById("terminal")`).
 *
 * @param {string} message - Text to display; coerced with String().
 * @param {string} [type="info"] - "error" or "success" select the alert
 *     colours; any other value uses the neutral log colour.
 */
function log(message, type = "info") {
    const time = new Date().toLocaleTimeString([], {
        hour12: false,
        hour: "2-digit",
        minute: "2-digit",
        second: "2-digit"
    });
    const color = type === "error" ? "#ff6f7f" : type === "success" ? "#3dcf8e" : "#c9d6ed";

    // Timestamp prefix, styled by the existing .log-time rule.
    const stamp = document.createElement("span");
    stamp.className = "log-time";
    stamp.textContent = "[" + time + "]";

    // Message body: textContent keeps "<", ">" and "&" literal.
    const text = document.createElement("span");
    text.style.color = color;
    text.textContent = String(message);

    const row = document.createElement("div");
    row.className = "log-line";
    row.appendChild(stamp);
    row.appendChild(text);

    terminal.appendChild(row);
    // Keep the newest entry visible.
    terminal.scrollTop = terminal.scrollHeight;
}
408
+
409
/**
 * Refresh the metric tiles from a /reset or /step response payload.
 *
 * Reads `data.observation` (cluster_id, lyapunov_energy, mode, step) and
 * `data.reward`; any missing field falls back to the placeholder the page
 * starts with. The reward tile is coloured "bad" for negative values and
 * "good" otherwise.
 *
 * @param {object} data - Parsed JSON response body.
 */
function updateUI(data) {
    const obs = data.observation || {};
    const rewardTile = document.getElementById("last-reward");

    // Anything that is not a number is displayed as a zero reward.
    let rewardValue = 0;
    if (typeof data.reward === "number") {
        rewardValue = data.reward;
    }

    const setTile = (id, text) => {
        document.getElementById(id).innerText = text;
    };

    // Cluster ids are truncated to 12 characters to fit the tile.
    setTile("cluster-id", (obs.cluster_id || "---").toString().slice(0, 12));
    setTile("lyapunov-val", Number(obs.lyapunov_energy || 0).toFixed(4));
    setTile("mode-val", (obs.mode || "simulated").toString());
    setTile("step-val", String(obs.step || 0));

    rewardTile.innerText = rewardValue.toFixed(4);
    if (rewardValue < 0) {
        rewardTile.className = "value bad";
    } else {
        rewardTile.className = "value good";
    }
}
422
+
423
/**
 * Start a new episode: POST an empty JSON object to /reset, then refresh the
 * metric tiles from the response.
 *
 * A non-2xx response is reported as a failure instead of being passed to
 * updateUI(); previously an HTTP error with a JSON body was rendered as if
 * the reset had succeeded. Network errors and unparsable bodies are caught
 * and logged as well.
 *
 * @returns {Promise<void>}
 */
async function resetEnv() {
    log("Resetting environment...");
    try {
        const response = await fetch("/reset", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({})
        });
        if (!response.ok) {
            // statusText can be empty (e.g. over HTTP/2), hence the trim.
            throw new Error(("HTTP " + response.status + " " + response.statusText).trim());
        }
        const data = await response.json();
        updateUI(data);
        log("Environment reset complete.", "success");
    } catch (err) {
        log("Reset failed: " + err.message, "error");
    }
}
438
+
439
/**
 * Send the action selected in the control form to /step and display the result.
 *
 * The request body is `{ action: { action_type, target_node_id, parameter } }`.
 * A response carrying a `detail` field is treated as a rejected payload
 * (presumably the server's validation-error shape -- confirm against the API).
 *
 * Compared with the previous version:
 *  - an empty or non-numeric parameter field is rejected locally; parseFloat()
 *    returns NaN for it, which JSON.stringify() would silently send as null;
 *  - a non-2xx response without a `detail` field is reported as a failure
 *    rather than rendered as a successful step.
 *
 * @returns {Promise<void>}
 */
async function stepEnv() {
    const parameter = parseFloat(document.getElementById("parameter").value);
    if (!Number.isFinite(parameter)) {
        log("Invalid parameter: enter a finite number.", "error");
        return;
    }

    const action = {
        action_type: document.getElementById("action-type").value,
        target_node_id: document.getElementById("node-id").value,
        parameter: parameter
    };

    log("Dispatching " + action.action_type + " to " + action.target_node_id + " (" + action.parameter + ")");

    try {
        const response = await fetch("/step", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ action: action })
        });
        const data = await response.json();

        if (data.detail) {
            log("Invalid payload: " + JSON.stringify(data.detail), "error");
            return;
        }

        if (!response.ok) {
            log("Execution failed: HTTP " + response.status, "error");
            return;
        }

        updateUI(data);
        log(
            "Step complete. Reward=" + Number(data.reward || 0).toFixed(3) +
            " Lyapunov=" + Number((data.observation || {}).lyapunov_energy || 0).toFixed(3),
            "success"
        );
    } catch (err) {
        // Covers network errors and bodies that are not valid JSON.
        log("Execution failed: " + err.message, "error");
    }
}
471
+ </script>
472
+ </body>
473
+ </html>
deploy/kind-maxpods-250.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ kind: Cluster
2
+ apiVersion: kind.x-k8s.io/v1alpha4
3
+ name: antiatropos-local
4
+ nodes:
5
+ - role: control-plane
6
+ kubeadmConfigPatches:
7
+ - |
8
+ kind: InitConfiguration
9
+ nodeRegistration:
10
+ kubeletExtraArgs:
11
+ max-pods: "250"
deploy/local-laptop.yaml ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Namespace used by every Deployment in this manifest.
kind: Namespace
apiVersion: v1
metadata:
  name: prod-sre
5
+ ---
6
# Synthetic "auth" workload (NODE_ID=node-4, BASE_QUEUE=6).
# The container serves /www on port 8080 with python's http.server and rewrites
# /www/metrics.txt in Prometheus text format roughly every 2 seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: auth
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: auth
  template:
    metadata:
      labels:
        app: auth
      annotations:
        # Scrape target description: port 8080, path /metrics.txt.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: auth
          image: python:3.12-alpine
          env:
            - name: NODE_ID
              value: node-4
            - name: BASE_QUEUE
              value: "6"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              # Running totals. Everything exported as a counter must only ever grow.
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                new_req=$((30 + noise))
                req=$((req + new_req))
                # Queue depth ranges from BASE_QUEUE-10 to BASE_QUEUE+9, floored at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Add only this tick's requests to the latency sum. Recomputing it as
                # req * lat_ms made the exported _sum fall whenever latency dropped,
                # which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + new_req * lat_ms))
                lat_sum=$(printf '%d.%03d' $((lat_sum_ms / 1000)) $((lat_sum_ms % 1000)))
                # Write to a temp file in the same directory and rename it into place
                # so a concurrent scrape never reads a half-written metrics.txt.
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/.metrics.txt.tmp
                mv /www/.metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
77
+ ---
78
# Synthetic "cart" workload (NODE_ID=node-3, BASE_QUEUE=14).
# The container serves /www on port 8080 with python's http.server and rewrites
# /www/metrics.txt in Prometheus text format roughly every 2 seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cart
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cart
  template:
    metadata:
      labels:
        app: cart
      annotations:
        # Scrape target description: port 8080, path /metrics.txt.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: cart
          image: python:3.12-alpine
          env:
            - name: NODE_ID
              value: node-3
            - name: BASE_QUEUE
              value: "14"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              # Running totals. Everything exported as a counter must only ever grow.
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                new_req=$((30 + noise))
                req=$((req + new_req))
                # Queue depth ranges from BASE_QUEUE-10 to BASE_QUEUE+9, floored at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Add only this tick's requests to the latency sum. Recomputing it as
                # req * lat_ms made the exported _sum fall whenever latency dropped,
                # which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + new_req * lat_ms))
                lat_sum=$(printf '%d.%03d' $((lat_sum_ms / 1000)) $((lat_sum_ms % 1000)))
                # Write to a temp file in the same directory and rename it into place
                # so a concurrent scrape never reads a half-written metrics.txt.
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/.metrics.txt.tmp
                mv /www/.metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
149
+ ---
150
# Synthetic "catalog" workload (NODE_ID=node-2, BASE_QUEUE=20).
# The container serves /www on port 8080 with python's http.server and rewrites
# /www/metrics.txt in Prometheus text format roughly every 2 seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: catalog
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: catalog
  template:
    metadata:
      labels:
        app: catalog
      annotations:
        # Scrape target description: port 8080, path /metrics.txt.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: catalog
          image: python:3.12-alpine
          env:
            - name: NODE_ID
              value: node-2
            - name: BASE_QUEUE
              value: "20"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              # Running totals. Everything exported as a counter must only ever grow.
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                new_req=$((30 + noise))
                req=$((req + new_req))
                # Queue depth ranges from BASE_QUEUE-10 to BASE_QUEUE+9, floored at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Add only this tick's requests to the latency sum. Recomputing it as
                # req * lat_ms made the exported _sum fall whenever latency dropped,
                # which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + new_req * lat_ms))
                lat_sum=$(printf '%d.%03d' $((lat_sum_ms / 1000)) $((lat_sum_ms % 1000)))
                # Write to a temp file in the same directory and rename it into place
                # so a concurrent scrape never reads a half-written metrics.txt.
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/.metrics.txt.tmp
                mv /www/.metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
221
+ ---
222
# Synthetic "checkout" workload (NODE_ID=node-1, BASE_QUEUE=24).
# The container serves /www on port 8080 with python's http.server and rewrites
# /www/metrics.txt in Prometheus text format roughly every 2 seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: checkout
  namespace: prod-sre
spec:
  replicas: 1
  selector:
    matchLabels:
      app: checkout
  template:
    metadata:
      labels:
        app: checkout
      annotations:
        # Scrape target description: port 8080, path /metrics.txt.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: checkout
          image: python:3.12-alpine
          env:
            - name: NODE_ID
              value: node-1
            - name: BASE_QUEUE
              value: "24"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              # Running totals. Everything exported as a counter must only ever grow.
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                new_req=$((30 + noise))
                req=$((req + new_req))
                # Queue depth ranges from BASE_QUEUE-10 to BASE_QUEUE+9, floored at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Add only this tick's requests to the latency sum. Recomputing it as
                # req * lat_ms made the exported _sum fall whenever latency dropped,
                # which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + new_req * lat_ms))
                lat_sum=$(printf '%d.%03d' $((lat_sum_ms / 1000)) $((lat_sum_ms % 1000)))
                # Write to a temp file in the same directory and rename it into place
                # so a concurrent scrape never reads a half-written metrics.txt.
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/.metrics.txt.tmp
                mv /www/.metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
293
+ ---
294
# Synthetic "payments" workload (NODE_ID=node-0, BASE_QUEUE=30, two replicas).
# Each container serves /www on port 8080 with python's http.server and rewrites
# /www/metrics.txt in Prometheus text format roughly every 2 seconds.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: payments
  namespace: prod-sre
spec:
  replicas: 2
  selector:
    matchLabels:
      app: payments
  template:
    metadata:
      labels:
        app: payments
      annotations:
        # Scrape target description: port 8080, path /metrics.txt.
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics.txt"
    spec:
      containers:
        - name: payments
          image: python:3.12-alpine
          env:
            - name: NODE_ID
              value: node-0
            - name: BASE_QUEUE
              value: "30"
          command: ["/bin/sh", "-lc"]
          args:
            - |
              mkdir -p /www
              echo ok > /www/index.html
              python -m http.server 8080 --directory /www >/tmp/http.log 2>&1 &
              # Running totals. Everything exported as a counter must only ever grow.
              req=0; err=0; cpu_total=0; lat_sum_ms=0
              while true; do
                t=$(date +%s)
                noise=$((t % 11))
                new_req=$((30 + noise))
                req=$((req + new_req))
                # Queue depth ranges from BASE_QUEUE-10 to BASE_QUEUE+9, floored at 0.
                q=$((BASE_QUEUE + (t % 20) - 10))
                if [ "$q" -lt 0 ]; then q=0; fi
                err=$((err + q / 20))
                cpu_inc=$((10 + q / 10))
                cpu_total=$((cpu_total + cpu_inc))
                lat_ms=$((35 + q * 3))
                b005=$((req / 5)); b01=$((req / 3)); b025=$((req / 2)); b05=$((req * 3 / 4)); b1=$req; b2=$req
                # Add only this tick's requests to the latency sum. Recomputing it as
                # req * lat_ms made the exported _sum fall whenever latency dropped,
                # which Prometheus interprets as a counter reset.
                lat_sum_ms=$((lat_sum_ms + new_req * lat_ms))
                lat_sum=$(printf '%d.%03d' $((lat_sum_ms / 1000)) $((lat_sum_ms % 1000)))
                # Write to a temp file in the same directory and rename it into place
                # so a concurrent scrape never reads a half-written metrics.txt.
                {
                  echo "# HELP http_requests_total Synthetic request counter"
                  echo "# TYPE http_requests_total counter"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"200\"} ${req}"
                  echo "http_requests_total{node_id=\"${NODE_ID}\",status=\"500\"} ${err}"
                  echo "# HELP queue_depth Synthetic queue depth"
                  echo "# TYPE queue_depth gauge"
                  echo "queue_depth{node_id=\"${NODE_ID}\"} ${q}"
                  echo "# HELP container_cpu_usage_seconds_total Synthetic CPU counter"
                  echo "# TYPE container_cpu_usage_seconds_total counter"
                  echo "container_cpu_usage_seconds_total{node_id=\"${NODE_ID}\"} ${cpu_total}"
                  echo "# HELP http_request_duration_seconds Synthetic request duration histogram"
                  echo "# TYPE http_request_duration_seconds histogram"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.05\"} ${b005}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.1\"} ${b01}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.25\"} ${b025}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"0.5\"} ${b05}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"1\"} ${b1}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"2\"} ${b2}"
                  echo "http_request_duration_seconds_bucket{node_id=\"${NODE_ID}\",le=\"+Inf\"} ${req}"
                  echo "http_request_duration_seconds_count{node_id=\"${NODE_ID}\"} ${req}"
                  echo "http_request_duration_seconds_sum{node_id=\"${NODE_ID}\"} ${lat_sum}"
                } > /www/.metrics.txt.tmp
                mv /www/.metrics.txt.tmp /www/metrics.txt
                sleep 2
              done
365
+
deploy/local/datasource-local.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Grafana datasource provisioning file: a single, default Prometheus datasource.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    # Fixed uid; presumably the value the provisioned dashboards reference.
    # Verify against the dashboard JSON before changing it.
    uid: PBFA97CFB590B2093
    url: http://prometheus-local:9090
    access: proxy
    isDefault: true
    editable: true
deploy/local/grafana-local-values.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Helm values for the local Grafana release.

# NOTE(review): the admin credentials are committed in plain text. Keep this
# file to throwaway local clusters; anything shared should take the password
# from a Secret instead.
adminUser: admin
adminPassword: antiatropos

service:
  type: ClusterIP

# Nothing is persisted; state is lost when the pod is replaced.
persistence:
  enabled: false

resources:
  requests:
    cpu: 100m
    memory: 192Mi
  limits:
    cpu: 400m
    memory: 384Mi

datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Prometheus
        # Pinned to the same uid as deploy/local/datasource-local.yaml so a
        # dashboard that references the datasource by uid resolves the same
        # way in both setups.
        uid: PBFA97CFB590B2093
        type: prometheus
        access: proxy
        # Assumes a Service named prometheus-server in the monitoring
        # namespace. TODO confirm against the Prometheus release name.
        url: http://prometheus-server.monitoring.svc.cluster.local
        isDefault: true
        editable: true

# Dashboard sidecar: picks up resources labelled grafana_dashboard="1" from
# every namespace.
sidecar:
  dashboards:
    enabled: true
    label: grafana_dashboard
    labelValue: "1"
    searchNamespace: ALL
deploy/local/prometheus-local-values.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Helm values for the local Prometheus release.

# Optional components are all switched off; only the server runs.
alertmanager:
  enabled: false

kube-state-metrics:
  enabled: false

prometheus-node-exporter:
  enabled: false

prometheus-pushgateway:
  enabled: false

# Two extra scrape jobs:
#   antiatropos-fastapi      - static target host.docker.internal:8000, path /metrics
#                              (assumes that name resolves from inside the cluster; verify on your setup)
#   prod-sre-annotated-pods  - pods in the prod-sre namespace annotated with
#                              prometheus.io/scrape "true"; the path and port
#                              annotations override the scrape path and port.
extraScrapeConfigs: |
  - job_name: 'antiatropos-fastapi'
    metrics_path: /metrics
    static_configs:
      - targets: ['host.docker.internal:8000']

  - job_name: 'prod-sre-annotated-pods'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names: ['prod-sre']
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__

server:
  service:
    type: ClusterIP
  # No persistent volume is requested for the server.
  persistentVolume:
    enabled: false
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
deploy/nginx.conf CHANGED
@@ -1,89 +1,89 @@
1
- worker_processes auto;
2
- pid /tmp/nginx.pid;
3
- error_log /dev/stderr info;
4
-
5
- events {
6
- worker_connections 1024;
7
- }
8
-
9
- http {
10
- include /etc/nginx/mime.types;
11
- default_type application/octet-stream;
12
- sendfile on;
13
- keepalive_timeout 65;
14
- access_log /dev/stdout;
15
-
16
- map $http_upgrade $connection_upgrade {
17
- default upgrade;
18
- '' close;
19
- }
20
-
21
- server {
22
- listen 7860;
23
- server_name _;
24
-
25
- client_max_body_size 50m;
26
- proxy_read_timeout 3600s;
27
- proxy_send_timeout 3600s;
28
-
29
- location = /prometheus {
30
- return 301 /prometheus/;
31
- }
32
-
33
- location = /grafana {
34
- return 301 /grafana/;
35
- }
36
-
37
- location /prometheus/ {
38
- proxy_pass http://127.0.0.1:9090;
39
- proxy_http_version 1.1;
40
- proxy_set_header Host $host;
41
- proxy_set_header X-Real-IP $remote_addr;
42
- proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
43
- proxy_set_header X-Forwarded-Host $host;
44
- proxy_set_header X-Forwarded-Proto $scheme;
45
- proxy_set_header X-Forwarded-Prefix /prometheus;
46
- }
47
-
48
- location /grafana/ {
49
- proxy_pass http://127.0.0.1:3000;
50
- proxy_http_version 1.1;
51
- proxy_set_header Host $host;
52
- proxy_set_header X-Real-IP $remote_addr;
53
- proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
54
- proxy_set_header X-Forwarded-Host $host;
55
- proxy_set_header X-Forwarded-Proto $scheme;
56
- proxy_set_header X-Forwarded-Prefix /grafana;
57
- }
58
-
59
- location /grafana/api/live/ {
60
- proxy_pass http://127.0.0.1:3000;
61
- proxy_http_version 1.1;
62
- proxy_set_header Upgrade $http_upgrade;
63
- proxy_set_header Connection $connection_upgrade;
64
- proxy_set_header Host $host;
65
- proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
66
- proxy_set_header X-Forwarded-Host $host;
67
- proxy_set_header X-Forwarded-Proto $scheme;
68
- proxy_set_header X-Forwarded-Prefix /grafana;
69
- }
70
-
71
- location / {
72
- root /var/www/html;
73
- index index.html;
74
- try_files $uri $uri/ @fastapi;
75
- }
76
-
77
- location @fastapi {
78
- proxy_pass http://127.0.0.1:8000;
79
- proxy_http_version 1.1;
80
- proxy_set_header Host $host;
81
- proxy_set_header X-Real-IP $remote_addr;
82
- proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
83
- proxy_set_header X-Forwarded-Host $host;
84
- proxy_set_header X-Forwarded-Proto $scheme;
85
- proxy_set_header Upgrade $http_upgrade;
86
- proxy_set_header Connection $connection_upgrade;
87
- }
88
- }
89
- }
 
1
# Reverse proxy listening on 7860. Routes by path prefix to three upstreams on
# 127.0.0.1: Prometheus (9090), Grafana (3000) and the FastAPI app (8000).
# Static files under /var/www/html are served directly.
worker_processes auto;
pid /tmp/nginx.pid;
error_log /dev/stderr info;

events {
    worker_connections 1024;
}

http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;
    sendfile on;
    keepalive_timeout 65;
    access_log /dev/stdout;

    # Connection header value for upstreams: "upgrade" when the client sent an
    # Upgrade header, "close" when it did not.
    map $http_upgrade $connection_upgrade {
        default upgrade;
        ''      close;
    }

    server {
        listen 7860;
        server_name _;

        client_max_body_size 50m;
        proxy_read_timeout 3600s;
        proxy_send_timeout 3600s;

        # Bare prefixes are redirected to the trailing-slash form matched below.
        location = /prometheus {
            return 301 /prometheus/;
        }

        location = /grafana {
            return 301 /grafana/;
        }

        # The request URI is forwarded unchanged, /prometheus/ prefix included.
        location /prometheus/ {
            proxy_pass http://127.0.0.1:9090;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Host $host;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header X-Forwarded-Prefix /prometheus;
        }

        # The request URI is forwarded unchanged, /grafana/ prefix included.
        location /grafana/ {
            proxy_pass http://127.0.0.1:3000;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Host $host;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header X-Forwarded-Prefix /grafana;
        }

        # Longer prefix than /grafana/, so it wins for these paths; adds the
        # Upgrade/Connection headers needed for WebSocket traffic.
        location /grafana/api/live/ {
            proxy_pass http://127.0.0.1:3000;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection $connection_upgrade;
            proxy_set_header Host $host;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Host $host;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header X-Forwarded-Prefix /grafana;
        }

        # Static file first; anything not found on disk goes to FastAPI.
        location / {
            root /var/www/html;
            index index.html;
            try_files $uri $uri/ @fastapi;
        }

        location @fastapi {
            proxy_pass http://127.0.0.1:8000;
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Host $host;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection $connection_upgrade;
        }
    }
}