"""Task: Kubernetes Pod Failures — MEDIUM.
Agent fixes common pod failure scenarios:
OOMKilled, ImagePullBackOff, wrong command, missing ConfigMap, liveness probe.
"""
from server.models import TaskDifficulty
from server.tasks.base import BaseTask
class K8sPodTask(BaseTask):
    """Kubernetes pod-failure debugging task (MEDIUM difficulty).

    Presents five independent scenarios; each gives the agent a broken set of
    manifest/source files plus the ``kubectl`` output an operator would see.
    The agent must edit the files so every check in ``expected_fixes`` passes.

    Scenario schema (each entry of ``SCENARIOS``)::

        {
            "id": str,            # unique scenario identifier
            "files": [            # workspace files handed to the agent
                {"path": str, "type": str, "content": str},
            ],
            "error": {            # the simulated failure the agent debugs
                "phase": str,     # where the failure surfaces (here: k8s_runtime)
                "message": str,   # kubectl output shown to the agent
            },
            "expected_fixes": [   # grading checks run against the edited files
                {"file": str, "type": "contains", "expected": str, "hint": str},
            ],
        }
    """

    NAME = "Kubernetes Pod Failures"
    DESCRIPTION = "Fix Kubernetes pod failures including CrashLoopBackOff, ImagePullBackOff, and resource issues"
    DIFFICULTY = TaskDifficulty.MEDIUM
    # No credentials are needed for any of these scenarios.
    AVAILABLE_SECRETS = []

    SCENARIOS = [
        # Scenario 1: CrashLoopBackOff — OOMKilled (memory limit too low)
        {
            "id": "oom_killed",
            "files": [
                {
                    "path": "k8s/deployment.yaml",
                    "type": "kubernetes",
                    "content": (
                        "apiVersion: apps/v1\n"
                        "kind: Deployment\n"
                        "metadata:\n"
                        "  name: api-server\n"
                        "spec:\n"
                        "  replicas: 3\n"
                        "  selector:\n"
                        "    matchLabels:\n"
                        "      app: api\n"
                        "  template:\n"
                        "    metadata:\n"
                        "      labels:\n"
                        "        app: api\n"
                        "    spec:\n"
                        "      containers:\n"
                        "      - name: api\n"
                        "        image: myapp:v1.2.3\n"
                        "        resources:\n"
                        "          limits:\n"
                        '            memory: "64Mi"\n'
                        '            cpu: "100m"\n'
                        "        ports:\n"
                        "        - containerPort: 8080\n"
                    ),
                }
            ],
            "error": {
                "phase": "k8s_runtime",
                "message": (
                    "$ kubectl get pods\n"
                    "NAME                         READY   STATUS             RESTARTS   AGE\n"
                    "api-server-7d4b8c9f5-x2k9m   0/1     CrashLoopBackOff   5          3m\n"
                    "\n"
                    "$ kubectl describe pod api-server-7d4b8c9f5-x2k9m\n"
                    "...\n"
                    "State:          Waiting\n"
                    "  Reason:       CrashLoopBackOff\n"
                    "Last State:     Terminated\n"
                    "  Reason:       OOMKilled\n"
                    "  Exit Code:    137\n"
                    "...\n"
                    "Events:\n"
                    "  Warning  OOMKilling  3m  kubelet  Memory limit 64Mi exceeded"
                ),
            },
            "expected_fixes": [
                {
                    "file": "k8s/deployment.yaml",
                    "type": "contains",
                    "expected": 'memory: "256Mi"',
                    "hint": "Container is OOMKilled with 64Mi limit. The app needs at least 256Mi.",
                }
            ],
        },
        # Scenario 2: ImagePullBackOff — image tag typo
        {
            "id": "image_pull_backoff",
            "files": [
                {
                    "path": "k8s/deployment.yaml",
                    "type": "kubernetes",
                    "content": (
                        "apiVersion: apps/v1\n"
                        "kind: Deployment\n"
                        "metadata:\n"
                        "  name: web-app\n"
                        "spec:\n"
                        "  replicas: 2\n"
                        "  selector:\n"
                        "    matchLabels:\n"
                        "      app: web\n"
                        "  template:\n"
                        "    metadata:\n"
                        "      labels:\n"
                        "        app: web\n"
                        "    spec:\n"
                        "      containers:\n"
                        "      - name: web\n"
                        "        image: nginx:latset\n"
                        "        ports:\n"
                        "        - containerPort: 80\n"
                    ),
                }
            ],
            "error": {
                "phase": "k8s_runtime",
                "message": (
                    "$ kubectl get pods\n"
                    "NAME                      READY   STATUS             RESTARTS   AGE\n"
                    "web-app-5f8d7b6c4-abc12   0/1     ImagePullBackOff   0          2m\n"
                    "\n"
                    "$ kubectl describe pod web-app-5f8d7b6c4-abc12\n"
                    "...\n"
                    "Events:\n"
                    '  Warning  Failed  2m  kubelet  Failed to pull image "nginx:latset": '
                    "rpc error: code = NotFound desc = failed to pull and unpack image: "
                    "reference not found\n"
                    "  Warning  Failed  2m  kubelet  Error: ImagePullBackOff\n"
                    "..."
                ),
            },
            "expected_fixes": [
                {
                    "file": "k8s/deployment.yaml",
                    "type": "contains",
                    "expected": "image: nginx:latest",
                    "hint": "Image tag has a typo: 'latset' should be 'latest'",
                }
            ],
        },
        # Scenario 3: CrashLoopBackOff — wrong command (filename typo)
        {
            "id": "wrong_command",
            "files": [
                {
                    "path": "k8s/deployment.yaml",
                    "type": "kubernetes",
                    "content": (
                        "apiVersion: apps/v1\n"
                        "kind: Deployment\n"
                        "metadata:\n"
                        "  name: worker\n"
                        "spec:\n"
                        "  replicas: 1\n"
                        "  selector:\n"
                        "    matchLabels:\n"
                        "      app: worker\n"
                        "  template:\n"
                        "    metadata:\n"
                        "      labels:\n"
                        "        app: worker\n"
                        "    spec:\n"
                        "      containers:\n"
                        "      - name: worker\n"
                        "        image: python:3.11-slim\n"
                        '        command: ["python", "workers.py"]\n'
                        "        resources:\n"
                        "          limits:\n"
                        '            memory: "512Mi"\n'
                        '            cpu: "500m"\n'
                    ),
                },
                {
                    "path": "app/worker.py",
                    "type": "other",
                    "content": (
                        "import time\n"
                        "\n"
                        "def main():\n"
                        "    while True:\n"
                        "        print('Processing...')\n"
                        "        time.sleep(5)\n"
                        "\n"
                        "if __name__ == '__main__':\n"
                        "    main()\n"
                    ),
                },
            ],
            "error": {
                "phase": "k8s_runtime",
                "message": (
                    "$ kubectl get pods\n"
                    "NAME                     READY   STATUS             RESTARTS   AGE\n"
                    "worker-6b8f9d7c4-kj3m2   0/1     CrashLoopBackOff   4          2m\n"
                    "\n"
                    "$ kubectl logs worker-6b8f9d7c4-kj3m2\n"
                    "python: can't open file '/workers.py': [Errno 2] No such file or directory\n"
                    "\n"
                    "$ kubectl describe pod worker-6b8f9d7c4-kj3m2\n"
                    "...\n"
                    "State:          Waiting\n"
                    "  Reason:       CrashLoopBackOff\n"
                    "Last State:     Terminated\n"
                    "  Reason:       Error\n"
                    "  Exit Code:    2\n"
                    "..."
                ),
            },
            "expected_fixes": [
                {
                    "file": "k8s/deployment.yaml",
                    "type": "contains",
                    "expected": 'command: ["python", "worker.py"]',
                    "hint": "The command references 'workers.py' but the file is named 'worker.py' (no 's')",
                }
            ],
        },
        # Scenario 4: CreateContainerConfigError — missing ConfigMap
        {
            "id": "missing_configmap",
            "files": [
                {
                    "path": "k8s/deployment.yaml",
                    "type": "kubernetes",
                    "content": (
                        "apiVersion: apps/v1\n"
                        "kind: Deployment\n"
                        "metadata:\n"
                        "  name: backend\n"
                        "spec:\n"
                        "  replicas: 2\n"
                        "  selector:\n"
                        "    matchLabels:\n"
                        "      app: backend\n"
                        "  template:\n"
                        "    metadata:\n"
                        "      labels:\n"
                        "        app: backend\n"
                        "    spec:\n"
                        "      containers:\n"
                        "      - name: backend\n"
                        "        image: mybackend:v2.0\n"
                        "        ports:\n"
                        "        - containerPort: 8080\n"
                        "        envFrom:\n"
                        "        - configMapRef:\n"
                        "            name: app-config\n"
                        "        resources:\n"
                        "          limits:\n"
                        '            memory: "512Mi"\n'
                        '            cpu: "500m"\n'
                    ),
                },
            ],
            "error": {
                "phase": "k8s_runtime",
                "message": (
                    "$ kubectl get pods\n"
                    "NAME                      READY   STATUS                       RESTARTS   AGE\n"
                    "backend-5c9d8f7b6-lm4n5   0/1     CreateContainerConfigError   0          1m\n"
                    "\n"
                    "$ kubectl describe pod backend-5c9d8f7b6-lm4n5\n"
                    "...\n"
                    "Events:\n"
                    '  Warning  Failed  1m  kubelet  Error: configmap "app-config" not found\n'
                    "..."
                ),
            },
            "expected_fixes": [
                {
                    # The fix is a NEW file: the agent must author the missing manifest.
                    "file": "k8s/configmap.yaml",
                    "type": "contains",
                    "expected": "name: app-config",
                    "hint": "The ConfigMap 'app-config' is referenced but doesn't exist. Create a ConfigMap manifest.",
                }
            ],
        },
        # Scenario 5: Pod not ready — liveness probe points at the wrong port
        {
            "id": "liveness_probe_failing",
            "files": [
                {
                    "path": "k8s/deployment.yaml",
                    "type": "kubernetes",
                    "content": (
                        "apiVersion: apps/v1\n"
                        "kind: Deployment\n"
                        "metadata:\n"
                        "  name: api\n"
                        "spec:\n"
                        "  replicas: 2\n"
                        "  selector:\n"
                        "    matchLabels:\n"
                        "      app: api\n"
                        "  template:\n"
                        "    metadata:\n"
                        "      labels:\n"
                        "        app: api\n"
                        "    spec:\n"
                        "      containers:\n"
                        "      - name: api\n"
                        "        image: myapi:v3.1\n"
                        "        ports:\n"
                        "        - containerPort: 8080\n"
                        "        livenessProbe:\n"
                        "          httpGet:\n"
                        "            path: /healthz\n"
                        "            port: 3000\n"
                        "          initialDelaySeconds: 5\n"
                        "          periodSeconds: 10\n"
                        "        readinessProbe:\n"
                        "          httpGet:\n"
                        "            path: /ready\n"
                        "            port: 8080\n"
                        "          initialDelaySeconds: 5\n"
                        "          periodSeconds: 10\n"
                        "        resources:\n"
                        "          limits:\n"
                        '            memory: "512Mi"\n'
                        '            cpu: "500m"\n'
                    ),
                },
            ],
            "error": {
                "phase": "k8s_runtime",
                "message": (
                    "$ kubectl get pods\n"
                    "NAME                  READY   STATUS    RESTARTS      AGE\n"
                    "api-7f8d9c6b5-gh7j8   0/1     Running   3 (30s ago)   2m\n"
                    "\n"
                    "$ kubectl describe pod api-7f8d9c6b5-gh7j8\n"
                    "...\n"
                    "Events:\n"
                    "  Warning  Unhealthy  90s  kubelet  Liveness probe failed: "
                    'Get "http://10.244.0.5:3000/healthz": dial tcp 10.244.0.5:3000: '
                    "connect: connection refused\n"
                    "  Normal  Killing  90s  kubelet  Container api failed liveness probe, "
                    "will be restarted\n"
                    "...\n"
                    "\n"
                    "Note: The application listens on port 8080, not 3000."
                ),
            },
            "expected_fixes": [
                {
                    "file": "k8s/deployment.yaml",
                    "type": "contains",
                    # Multi-line anchor: matches only when the LIVENESS probe's port
                    # was changed to 8080 (the readiness probe's identical stanza is
                    # followed by "resources:", not "readinessProbe:", so it cannot
                    # satisfy this substring on its own).  Indentation here must stay
                    # in sync with the manifest above.
                    "expected": (
                        "port: 8080\n"
                        "          initialDelaySeconds: 5\n"
                        "          periodSeconds: 10\n"
                        "        readinessProbe:"
                    ),
                    "hint": "The liveness probe port (3000) doesn't match the container port (8080). Change liveness probe port to 8080.",
                }
            ],
        },
    ]