File size: 17,956 Bytes
18f9970
74dfd77
 
 
 
 
 
44f306a
74dfd77
 
 
 
18f9970
 
 
748ed82
74dfd77
18f9970
 
eaf3506
 
 
 
 
 
 
 
 
 
 
 
74dfd77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18f9970
74dfd77
18f9970
74dfd77
 
 
 
 
c033961
 
 
 
 
 
 
74dfd77
 
 
 
 
 
c033961
 
74dfd77
 
 
 
 
 
 
 
 
 
 
18f9970
74dfd77
 
44f306a
74dfd77
 
 
44f306a
 
 
 
 
 
 
 
 
 
 
 
 
 
74dfd77
 
18f9970
 
74dfd77
 
 
 
 
 
 
 
 
 
18f9970
74dfd77
 
 
 
 
 
44f306a
74dfd77
 
44f306a
 
 
 
 
74dfd77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c033961
44f306a
 
c033961
 
 
44f306a
 
 
 
c033961
 
 
 
 
 
44f306a
 
 
c033961
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74dfd77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18f9970
44f306a
 
 
 
 
 
 
 
 
 
18f9970
 
74dfd77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18f9970
 
74dfd77
 
 
 
eaf3506
18f9970
74dfd77
 
eaf3506
 
 
 
 
18f9970
 
74dfd77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb0313f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b06368
 
cd27210
9b06368
18f9970
74dfd77
 
 
 
 
eaf3506
18f9970
74dfd77
 
 
eaf3506
 
 
18f9970
74dfd77
 
 
 
 
 
 
 
748ed82
74dfd77
 
 
18f9970
 
74dfd77
 
 
 
18f9970
 
 
 
 
 
74dfd77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748ed82
 
 
 
 
74dfd77
748ed82
18f9970
74dfd77
44f306a
 
 
 
74dfd77
 
 
 
 
 
748ed82
 
74dfd77
 
 
 
748ed82
 
74dfd77
 
 
 
748ed82
 
74dfd77
 
 
 
 
 
 
18f9970
74dfd77
18f9970
 
74dfd77
 
18f9970
74dfd77
18f9970
74dfd77
 
 
 
 
18f9970
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
# models.py
# Phase 2 — All Pydantic models for FirewatchEnv.
# Every field has explicit type annotations. No Any is used
# (FirewatchAction.parameters is typed dict[str, object]).
# Field names follow OpenTelemetry semantic conventions.
#
# Models defined here:
#   1. ServiceMetrics — per-service telemetry snapshot (21 OTel fields)
#   2. Alert — Simplified LLM-friendly alert (Prometheus Alertmanager-inspired)
#   3. SystemObservation — complete observable state (returned by reset/step/state)
#   4. FirewatchAction — agent command with strict Literal action_type
#   5. ActionResult — structured result of an action
#   6. derive_status() — utility to compute status from metric thresholds

from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, Field

# OpenEnv base types — provide done, reward, metadata fields
# required by the HTTP server's serialize_observation() and deserialize_action()
try:
    from openenv.core.env_server.types import (
        Observation as _ObservationBase,
        Action as _ActionBase,
    )
except ImportError:
    # Fallback for environments where openenv-core is not installed
    _ObservationBase = BaseModel  # type: ignore[assignment,misc]
    _ActionBase = BaseModel  # type: ignore[assignment,misc]

try:
    from .config import (
        STATUS_THRESHOLD_CRITICAL_ERROR,
        STATUS_THRESHOLD_CRITICAL_LATENCY,
        STATUS_THRESHOLD_DEGRADED_ERROR,
        STATUS_THRESHOLD_DEGRADED_LATENCY,
        STATUS_THRESHOLD_DOWN_ERROR,
        STATUS_THRESHOLD_DOWN_MEMORY,
    )
except ImportError:
    from config import (
        STATUS_THRESHOLD_CRITICAL_ERROR,
        STATUS_THRESHOLD_CRITICAL_LATENCY,
        STATUS_THRESHOLD_DEGRADED_ERROR,
        STATUS_THRESHOLD_DEGRADED_LATENCY,
        STATUS_THRESHOLD_DOWN_ERROR,
        STATUS_THRESHOLD_DOWN_MEMORY,
    )


# --------------------------------------------------------------------------
# Type aliases for readability
# --------------------------------------------------------------------------

# Health states assignable to ServiceMetrics.status, listed from least to
# most severe (derive_status() checks them in the reverse order).
ServiceStatus = Literal[
    "healthy",
    "degraded",
    "critical",
    "down",
]

# Values accepted by Alert.alertname.
AlertName = Literal[
    "HighErrorRate", "HighLatency", "MemoryPressure",
    "HighCPU", "ServiceDown", "RequestBacklog",
]

# Values accepted by Alert.severity.
AlertSeverity = Literal[
    "warning",
    "critical",
    "page",
]

# Values accepted by FirewatchAction.action_type, grouped by purpose.
ActionType = Literal[
    # Investigation actions — reveal information, no state mutation
    "fetch_logs", "get_metrics_detail", "trace_dependencies",
    # Advanced diagnostic investigation actions (SPEC-9)
    "strace_process", "profiler_dump", "check_gc_pressure",
    "trace_distributed_request", "inspect_thread_pool", "inspect_commit_diff",
    # Remediation actions — mutate system state
    "restart_service", "rollback_deploy", "revert_config",
    "scale_replicas", "circuit_break",
    # Advanced remediation actions (SPEC-9)
    "traffic_shift",
    # Meta actions — episode control
    "declare_resolved", "escalate",
]


# --------------------------------------------------------------------------
# ServiceMetrics — per-service telemetry (replaces Phase 1 ServiceSnapshot)
# --------------------------------------------------------------------------

class ServiceMetrics(BaseModel):
    """
    Complete telemetry snapshot for one microservice.

    Field names follow OpenTelemetry semantic conventions (v1.23.0+ stable).
    Underscore naming is the Pydantic convention; each field documents
    the corresponding OTel dot-notation name.

    OTel convention notes:
    - HTTP metrics use the stable http.server.* namespace (since v1.23.0, Feb 2024)
    - Process metrics use the development process.* namespace
    - JVM/runtime metrics use the old experimental process.runtime.jvm.* namespace
      rather than the newer stable jvm.* naming. Reason: env was designed against
      the process.runtime conventions stable through 2023; renaming mid-project
      would break grader fixtures.
    - GC pause duration uses milliseconds (_ms suffix) as a deliberate deviation
      from OTel's seconds-preferred guideline, chosen to make log output and
      diagnostic values human-readable in the agent prompt.
    - http_server_request_duration_p99 is a scalar p99 projection from the OTel
      histogram (http.server.request.duration). Real OTel emits a histogram;
      the scalar is a simulation convenience.

    Status is NOT auto-computed — the simulation sets it explicitly
    via derive_status() after mutating metrics each tick.

    Only service_name and service_instance_id are required; every other
    field carries a default.
    """

    # --- Resource attributes (OTel resource) ---
    service_name: str = Field(
        ..., description="OTel: service.name. e.g. 'payment-service'"
    )
    service_version: str = Field(
        default="v1.0.0", description="OTel: service.version"
    )
    service_instance_id: str = Field(
        ..., description="OTel: service.instance.id. e.g. 'payment-7d9f8b-xkp2m'"
    )

    # --- Derived status ---
    status: ServiceStatus = Field(
        default="healthy",
        description="Derived from metric thresholds. Set by simulation via derive_status().",
    )

    # --- HTTP server metrics (OTel stable since v1.23.0, Feb 2024) ---
    http_server_request_duration_p99: float = Field(
        default=0.1,
        description=(
            "OTel stable: http.server.request.duration (histogram p99 projection). "
            "Unit: seconds per OTel General Metrics Guidelines. "
            "Healthy: 0.05–0.5s. Critical: >2.0s."
        ),
    )
    http_server_error_rate: float = Field(
        default=0.0,
        description="Derived from OTel http.response.status_code 5xx ratio. Unit: ratio 0.0–1.0.",
    )
    http_server_active_requests: int = Field(
        default=50,
        description="OTel: http.server.active_requests. Unit: {request}. Normal: 1–200.",
    )

    # --- Process metrics (OTel) ---
    process_cpu_utilization: float = Field(
        default=0.15,
        description="OTel: process.cpu.utilization. Unit: ratio 0.0–1.0 (NOT percentage).",
    )
    process_memory_usage_bytes: int = Field(
        default=178257920,
        description="OTel: process.memory.usage. Unit: bytes. ~170MB default.",
    )
    process_memory_limit_bytes: int = Field(
        default=536870912,
        description="Container config, not OTel-emitted. Unit: bytes. 512MB default.",
    )
    # Stored independently of the two byte counters above; the model itself
    # does not recompute it.
    process_memory_utilization: float = Field(
        default=0.33,
        description="Derived: usage_bytes / limit_bytes. Can exceed 1.0 before OOMKill.",
    )
    process_open_file_descriptors: int = Field(
        default=120,
        description="OTel: process.open_file_descriptor.count. High = connection exhaustion.",
    )

    # --- Runtime performance metrics (JVM/V8/Go runtime) ---
    # NOTE: Field names use the old experimental process.runtime.jvm.* namespace
    # rather than the newer stable jvm.* naming. See class docstring for rationale.
    runtime_gc_pause_duration_ms: float = Field(
        default=15.0,
        description=(
            "OTel experimental: process.runtime.jvm.gc.pause_duration "
            "(stable equivalent: jvm.gc.duration histogram p99 projection). "
            "Unit: milliseconds (deliberate deviation from OTel seconds-preferred "
            "guideline — chosen for human-readable prompt output). "
            "Healthy: <50ms. Critical: >500ms."
        ),
    )
    runtime_gc_count_per_second: float = Field(
        default=2.0,
        description=(
            "Derived from OTel jvm.gc.duration histogram count rate. "
            "Unit: {gc}/s. Not a standard OTel metric — computed from "
            "the histogram's count/sum/min/max. "
            "Healthy: <5. Thrashing: >30."
        ),
    )
    runtime_jvm_threads_count: int = Field(
        default=50,
        description=(
            "OTel: runtime.jvm.threads.count. "
            "Unit: {thread}. Active threads. "
            "Saturated when == max_threads."
        ),
    )
    runtime_jvm_threads_max: int = Field(
        default=200,
        description=(
            "OTel: Configured max thread pool size. "
            "Saturation = threads_count >= threads_max."
        ),
    )
    runtime_thread_pool_queue_depth: int = Field(
        default=0,
        description=(
            "OTel-adjacent: Pending requests in thread pool queue. "
            "High value = backpressure, head-of-line blocking."
        ),
    )

    # --- Runtime / deployment metadata ---
    runtime_uptime_seconds: int = Field(
        default=86400,
        description="OTel: process.runtime.uptime. Resets to 0 on restart. 24h default.",
    )
    restart_count: int = Field(
        default=0,
        description="OTel-adjacent: k8s.container.restart_count. Increments on OOMKill.",
    )
    last_deployment_sha: str = Field(
        default="a3f9d21",
        description="Short git SHA of last deployment.",
    )
    last_deployment_age_seconds: int = Field(
        default=172800,
        description="Seconds since last deployment. Low = recent deploy = suspect for bad_deploy.",
    )
    last_config_revision: int = Field(
        default=1,
        description="Monotonically increasing config revision number.",
    )
    last_config_age_seconds: int = Field(
        default=259200,
        description="Seconds since last config change. Low = suspect for config_drift.",
    )

    # --- Logs (populated only after fetch_logs action) ---
    # default_factory gives every instance its own list.
    recent_logs: list[str] = Field(
        default_factory=list,
        description="Empty by default. Populated by fetch_logs action. Last 20 log lines.",
    )


# --------------------------------------------------------------------------
# Alert — simplified, Prometheus Alertmanager-inspired format
# --------------------------------------------------------------------------

class Alert(BaseModel):
    """
    Flat, LLM-friendly alert record modelled loosely on Prometheus Alertmanager.

    Differences from the real Alertmanager payload are intentional:
    - {alertname, severity} are top-level fields here instead of living under
      `labels`, and {summary, description} are not nested under `annotations`.
    - Firing time is a simulation tick, not an RFC3339 timestamp.

    The flat layout gives up Alertmanager wire-compatibility in exchange for
    simpler agent prompt construction. If wire-compat were ever required, a
    shim in the episode loader could translate this model to the real schema.

    Reference: prometheus.io/docs/alerting/latest/configuration/ (webhook_config)

    Every field is required; none has a default.
    """

    # --- Identity and classification ---
    alert_id: str = Field(..., description="Short UUID. e.g. 'a1b2c3d4'")
    alertname: AlertName = Field(..., description="Human-readable alert name.")
    service_name: str = Field(..., description="Which service triggered the alert.")
    severity: AlertSeverity = Field(..., description="Severity level.")

    # --- Human-readable summary ---
    description: str = Field(
        ...,
        description=(
            "Human-readable description. "
            "Format: '<metric> is <value> (threshold: <threshold>) "
            "on <service> for <n> ticks'"
        ),
    )

    # --- When and why the alert fired ---
    fired_at_tick: int = Field(
        ...,
        description="Simulation tick when the threshold was crossed.",
    )
    metric_name: str = Field(
        ...,
        description="The OTel metric name that breached threshold.",
    )
    metric_value: float = Field(
        ...,
        description="Current value at time of firing.",
    )
    threshold_value: float = Field(
        ...,
        description="The configured threshold that was crossed.",
    )


# --------------------------------------------------------------------------
# SystemObservation — complete observable state
# --------------------------------------------------------------------------

class SystemObservation(_ObservationBase):
    """
    Complete observable state returned by reset(), step(), and state().
    The agent receives this after every action.

    Inherits from openenv Observation which provides:
      - done: bool (episode terminated)
      - reward: float | None (step reward)
      - metadata: dict (additional info dict)

    NOTE: when openenv-core is not installed the base class falls back to a
    plain pydantic BaseModel (see the import guard at the top of this module),
    in which case those three inherited fields are not declared.

    Every field declared here has a default, so the model can be constructed
    with no arguments.
    """

    services: dict[str, ServiceMetrics] = Field(
        default_factory=dict,
        description="Per-service metrics keyed by service_name. Subset of full topology.",
    )
    active_alerts: list[Alert] = Field(
        default_factory=list,
        description="Currently firing alerts. Auto-resolve when metric recovers.",
    )
    dependency_graph: dict[str, list[str]] = Field(
        default_factory=dict,
        description="Static topology for this episode. Does not change between ticks.",
    )
    slo_budget_remaining_pct: float = Field(
        default=100.0,
        description="Error budget %. Starts at 100.0, depletes per tick. 0.0 = episode over.",
    )
    bad_customer_minutes: float = Field(
        default=0.0,
        description="Cumulative user impact. Google SRE MTTM measurement.",
    )
    sim_time_elapsed_seconds: int = Field(
        default=0,
        description="Simulated seconds since episode start. 30s per tick.",
    )
    sim_tick: int = Field(
        default=0,
        description="Current tick number. Starts at 0 after reset().",
    )
    action_history: list[dict[str, str]] = Field(
        default_factory=list,
        description=(
            "Last 10 actions. Each entry: "
            "{action_type, target_service, feedback_string}."
        ),
    )
    incident_declared: bool = Field(
        default=False,
        description="True if agent called declare_resolved. Terminal condition.",
    )
    mttm_achieved_tick: int | None = Field(
        default=None,
        description="Tick when user impact first reached zero. None until achieved.",
    )
    # Defaults to True: a fresh observation is treated as impacting users
    # until the simulation says otherwise.
    user_impact_active: bool = Field(
        default=True,
        description=(
            "True if any user-facing service (api-gateway or checkout-service) "
            "has error_rate above the DEGRADED threshold. When False, SLO burn "
            "rate is reduced to 20% via the mitigation shield."
        ),
    )
    current_slo_burn_rate: float = Field(
        default=1.5,
        description=(
            "The SLO burn rate applied this tick. Equal to the difficulty's "
            "base burn rate when user_impact_active, or base × 0.2 when mitigated."
        ),
    )
    episode_score: float | None = Field(
        default=None,
        description="Final grader score in (0.0, 1.0) exclusive. Set only when done=True.",
    )


# --------------------------------------------------------------------------
# FirewatchAction — agent command
# --------------------------------------------------------------------------

class FirewatchAction(_ActionBase):
    """
    A single command issued by the agent.

    action_type is strictly validated against the values enumerated in the
    ActionType Literal; any other value is rejected with a Pydantic
    ValidationError. The environment catches ValidationError and returns a
    graceful error response.

    Inherits from openenv Action which provides:
      - metadata: dict (additional action metadata)
    """

    # Required: which SRE command to run.
    action_type: ActionType = Field(..., description="SRE command to execute.")

    # Optional at the schema level; this model does not itself enforce the
    # "required for most actions" rule stated in the description.
    target_service: str | None = Field(
        description="service_name to target. Required for all except declare_resolved/escalate.",
        default=None,
    )

    # Free-form per-action arguments; each instance gets its own dict.
    parameters: dict[str, object] = Field(
        description="Optional action params. e.g. {'memory_limit_mb': 1024} for scale_replicas.",
        default_factory=dict,
    )


# --------------------------------------------------------------------------
# ActionResult — structured action feedback
# --------------------------------------------------------------------------

class ActionResult(BaseModel):
    """
    Outcome of one agent action, in structured form.

    Included in the info dict returned by every step() call.
    `valid` and `feedback` are required; the two echo fields are optional.
    """

    # --- Required outcome ---
    valid: bool = Field(
        ...,
        description="Whether the action was valid and executed.",
    )
    feedback: str = Field(
        ...,
        description="Human-readable feedback about what happened.",
    )

    # --- Echo of the request (plain str, not the ActionType Literal) ---
    action_type: str = Field(
        description="Echo of the action_type that was executed.",
        default="",
    )
    target_service: str | None = Field(
        description="Echo of the target_service.",
        default=None,
    )


# --------------------------------------------------------------------------
# Status derivation utility
# --------------------------------------------------------------------------

def derive_status(
    error_rate: float,
    latency_p99: float,
    memory_utilization: float,
) -> ServiceStatus:
    """
    Map a service's current metrics onto a ServiceStatus label.

    Tiers are checked from most to least severe and the first match wins:
    down → critical → degraded → healthy. Every comparison is inclusive (>=).

    - "down":     error rate or memory utilization at/above its DOWN threshold
    - "critical": error rate or p99 latency at/above its CRITICAL threshold
    - "degraded": error rate or p99 latency at/above its DEGRADED threshold
    - "healthy":  none of the above

    Memory utilization only participates in the "down" tier.

    The threshold values are the STATUS_THRESHOLD_* constants imported from
    config. Design rationale recorded for them:
    - Error thresholds (0.10/0.50/0.90): canonical 99.9% SLO tier error budget
    - Latency thresholds (0.50s/2.0s): Prometheus default HTTP histogram buckets
    - Memory 0.98: Linux cgroup OOM territory (one tick before kernel kill)

    The simulation calls this after mutating metrics each tick to refresh the
    status field. Status is NOT auto-computed on model access because the
    simulation needs explicit control over when status updates happen.
    """
    # Tier "down": either signal alone is enough.
    if error_rate >= STATUS_THRESHOLD_DOWN_ERROR:
        return "down"
    if memory_utilization >= STATUS_THRESHOLD_DOWN_MEMORY:
        return "down"

    # Tier "critical".
    if error_rate >= STATUS_THRESHOLD_CRITICAL_ERROR:
        return "critical"
    if latency_p99 >= STATUS_THRESHOLD_CRITICAL_LATENCY:
        return "critical"

    # Tier "degraded".
    if error_rate >= STATUS_THRESHOLD_DEGRADED_ERROR:
        return "degraded"
    if latency_p99 >= STATUS_THRESHOLD_DEGRADED_LATENCY:
        return "degraded"

    # No threshold reached.
    return "healthy"


# --------------------------------------------------------------------------
# Public API
# --------------------------------------------------------------------------

# Names exported by a star-import of this module.
__all__ = [
    # Pydantic models
    "ServiceMetrics",
    "Alert",
    "SystemObservation",
    "FirewatchAction",
    "ActionResult",
    # Literal type aliases
    "ActionType",
    "AlertName",
    "AlertSeverity",
    "ServiceStatus",
    # Status derivation helper
    "derive_status",
]