File size: 15,274 Bytes
a46811c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
524b287
 
 
 
 
 
 
 
 
 
 
a46811c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
"""
server/logs.py — Framework-specific log message templates per failure type.

Each failure type has 5-10 realistic log templates drawn from real frameworks:
Spring Boot, Node.js, FastAPI, Kubernetes, HikariCP, Redis, gRPC.

Templates use placeholders {service}, {dependency}, {value} etc. that are
filled at runtime with actual service/metric values.

Sources: Docs/DataResearch.md Answer 4 + Answer 11.
"""

from __future__ import annotations

import random
from typing import Dict, List, Optional

from server.failures import FailureType


# ---------------------------------------------------------------------------
# Log templates per failure type
# ---------------------------------------------------------------------------

# Master template table: maps each FailureType to a pool of realistic log
# lines. Placeholders like {service}/{dependency} are substituted at runtime
# by _fill_placeholders(); a template is drawn uniformly via rng.choice().
# NOTE: some templates contain literal braces (e.g. Go's `interface {}`),
# so substitution uses str.replace, not str.format.
_TEMPLATES: Dict[FailureType, List[str]] = {
    # Container/process death: OOM kills, crash loops, heap exhaustion.
    FailureType.CRASH: [
        "ERROR {service} OOMKilled: container exceeded memory limit ({memory_limit}Mi). Exit code 137. Pod restarting (backoff: {backoff}s)",
        "FATAL {service} Process exited with signal 9 (SIGKILL). Out of memory. Restart count: {restart_count}",
        "ERROR {service} CrashLoopBackOff: back-off restarting failed container. Last exit: OOMKilled",
        "CRIT {service} JVM heap exhausted: java.lang.OutOfMemoryError: Java heap space. Heap: {heap_used}Mi/{heap_max}Mi",
        "ERROR {service} Panic: runtime error: out of memory. goroutine stack overflow at allocateHeap()",
        "FATAL {service} Node process crashed: FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory",
    ],

    # Regressions introduced by a new release: null derefs, type errors,
    # schema mismatches — all tagged with {version} to implicate the deploy.
    FailureType.BAD_DEPLOY: [
        "ERROR {service} {version} NullPointerException: Cannot invoke \"{method}\" on null reference at {class}.process({class}.java:{line})",
        "ERROR {service} {version} TypeError: Cannot read properties of undefined (reading '{property}'). Stack: at {handler} ({file}:{line})",
        "ERROR {service} {version} Traceback (most recent call last):\\n  File \"{file}\", line {line}\\n    {code_line}\\nAttributeError: '{class}' object has no attribute '{attribute}'",
        "ERROR {service} {version} panic: interface conversion: interface {} is nil, not *{type}. goroutine {goroutine_id} [running]",
        "ERROR {service} {version} Unhandled rejection: ValidationError: \"{field}\" is required. Schema version mismatch between {version} and data format.",
        "WARN  {service} {version} Health check failing: /health returned 500. Error rate climbing: {error_rate}%",
    ],

    # Misconfiguration that prevents the process from starting at all.
    FailureType.CONFIG_STARTUP: [
        "FATAL {service} password authentication failed for user \"{db_user}\" on {dependency}:{port}. Connection refused.",
        "ERROR {service} Could not resolve placeholder '{config_key}' in value \"${{{config_key}}}\"",
        "FATAL {service} Configuration error: required key [{config_key}] not found in application.yml",
        "ERROR {service} Failed to bind to port {port}: EADDRINUSE. Another process is using this port.",
        "FATAL {service} SSL/TLS certificate error: certificate has expired. CN={dependency}. Valid until: {expiry}",
        "ERROR {service} Cannot connect to {dependency}: Connection refused. Retried {retry_count} times, giving up.",
    ],

    # Misconfiguration that only bites after startup (feature flags,
    # wrong endpoints, rate-limit mismatches).
    FailureType.CONFIG_RUNTIME: [
        "ERROR {service} Request to https://{config_value}/charge failed: ECONNREFUSED. Feature \"{feature_flag}\" enabled but endpoint misconfigured.",
        "WARN  {service} Fallback triggered for {dependency}: timeout after {timeout_ms}ms. Config key '{config_key}' may be incorrect.",
        "ERROR {service} Invalid JSON response from {dependency}: Unexpected token '<' at position 0. Endpoint returning HTML instead of API response.",
        "ERROR {service} Feature flag '{feature_flag}' enabled new code path but dependency '{dependency}' not configured. Returning 500 for {error_rate}% of /api/v2 requests.",
        "WARN  {service} Rate limit config mismatch: max_rps={config_value} but actual traffic is {throughput}rps. Dropping {error_rate}% of requests.",
    ],

    # Thread-pool saturation where THIS service is the true bottleneck;
    # messages deliberately spell out the remediation (scale/restart).
    FailureType.CASCADING_LATENCY: [
        "WARN  {service} Thread pool self-saturation: {active}/{pool_size} worker threads active. Queue depth: {queue_depth}. Avg wait: {wait_ms}ms. "
        "This service is the bottleneck — scale or rebalance traffic away from this service.",
        "WARN  {service} Worker thread exhaustion: arrival rate {throughput}rps exceeds processing capacity. "
        "Active threads: {active}/{pool_size}. Queued: {queue_depth}. Fix: scale_service or rebalance_traffic.",
        "ERROR {service} Request queue overflow: {queue_depth} requests waiting for worker threads ({active}/{pool_size} busy). "
        "p99={p99_ms}ms. Root cause is this service's own capacity — restart to clear threads or scale to add capacity.",
        "WARN  {service} Internal latency spiral: p99={p99_ms}ms (baseline: {baseline_ms}ms). Thread pool utilisation critical. "
        "Retry amplification causing {throughput}rps effective load. This service needs to be restarted or scaled.",
        "CRIT  {service} Capacity overload: {active}/{pool_size} threads saturated, {queue_depth} requests pending. "
        "All downstream timeouts are a symptom of THIS service being overwhelmed. "
        "Run: restart_service or scale_service on {service}.",
    ],

    # Slow-burn leaks: memory/GC pressure, fds, goroutines, connections.
    FailureType.RESOURCE_LEAK: [
        "WARN  {service} Memory usage {memory_pct}% ({memory_used}Mi/{memory_limit}Mi). GC overhead {gc_pct}%. Last full GC: {gc_pause}s pause. Allocation failure imminent.",
        "WARN  {service} File descriptor leak detected: open_fds={open_fds} (limit: {fd_limit}). Growing at {fd_rate}/min.",
        "WARN  {service} Goroutine leak: count={goroutine_count} (baseline: {baseline}). Growing linearly. Stack trace: {leak_source}",
        "ERROR {service} GC overhead limit exceeded: spending {gc_pct}% of time in GC. Heap: {memory_used}Mi/{memory_limit}Mi.",
        "WARN  {service} Connection leak to {dependency}: {active} connections checked out but not returned. Pool: {active}/{pool_size}.",
    ],

    # Database trouble: pool exhaustion (HikariCP), slow queries, lag, deadlocks.
    FailureType.DB_DEGRADATION: [
        "ERROR {service} HikariPool-1 connection not available, request timed out after {timeout_ms}ms. Active: {active}/{pool_size}, Waiting: {waiting}.",
        "WARN  {service} Slow query detected: SELECT * FROM {table} WHERE ... took {query_ms}ms (threshold: {threshold_ms}ms). Lock contention on {table}.",
        "ERROR {service} Connection pool exhausted for {dependency}. Active: {active}/{pool_size}. Oldest connection age: {age_ms}ms.",
        "WARN  {service} Database replication lag: {lag_ms}ms on {dependency}. Read-after-write consistency violated.",
        "ERROR {service} Deadlock detected on {dependency}: Transaction {tx_id} waiting for lock held by {blocking_tx}. Auto-rolling back.",
        "WARN  {service} {dependency} CPU={db_cpu}% but app CPU={app_cpu}% (paradoxically low). Threads blocked on I/O wait.",
    ],

    # Cache-tier failures (Redis): cluster down, stampedes, eviction storms.
    FailureType.CACHE_FAILURE: [
        "WARN  {service} CLUSTERDOWN: {dependency} cluster is down. Hit rate dropped from {baseline_hit_rate}% to 0%. Backend QPS spiked {spike_factor}x.",
        "ERROR {service} Redis connection lost: {dependency} ECONNRESET. Failover in progress. Cache miss rate: 100%.",
        "WARN  {service} Cache stampede detected: {concurrent_misses} concurrent cache misses for key pattern '{key_pattern}'. Backend overloaded.",
        "ERROR {service} {dependency} READONLY: Redis replica cannot accept writes. Cluster rebalancing.",
        "WARN  {service} Cache eviction storm: {evicted} keys evicted in last {interval}s. Memory pressure on {dependency}.",
    ],

    # Network-layer failures: DNS, TCP timeouts, TLS, gRPC UNAVAILABLE.
    FailureType.NETWORK_ERROR: [
        "ERROR {service} DNS resolution failed for {dependency}.{region}.internal: NXDOMAIN. 0/{endpoint_count} endpoints reachable.",
        "ERROR {service} TCP connection to {dependency}:{port} failed: ETIMEDOUT after {timeout_ms}ms. Network partition suspected.",
        "ERROR {service} TLS handshake failed with {dependency}: certificate verify failed (depth 0). CN mismatch or expired cert.",
        "CRIT  {service} All endpoints for {dependency} unreachable in region {region}. Last successful connection: {last_success} ago.",
        "ERROR {service} gRPC transport error: UNAVAILABLE: {dependency} DNS resolution failed for \"{dependency}.svc.cluster.local\"",
    ],
}


# ---------------------------------------------------------------------------
# Placeholder value generators
# ---------------------------------------------------------------------------


def _random_class_name(rng: random.Random) -> str:
    prefixes = ["Payment", "Order", "Auth", "Inventory", "Cart", "Billing", "Shipping"]
    suffixes = ["Service", "Handler", "Controller", "Processor", "Manager"]
    return rng.choice(prefixes) + rng.choice(suffixes)


def _random_method(rng: random.Random) -> str:
    return rng.choice(["process", "handle", "execute", "validate", "transform", "serialize", "getId", "getStatus"])


def _random_property(rng: random.Random) -> str:
    return rng.choice(["id", "status", "amount", "userId", "orderId", "timestamp", "payload", "response"])


def _fill_placeholders(
    template: str,
    service_id: str,
    rng: random.Random,
    dependency: str = "unknown",
    error_rate: float = 0.0,
    memory_pct: float = 50.0,
    p99_ms: float = 100.0,
    pool_pct: float = 10.0,
    version: str = "v1.0.0",
    config_key: str = "db_host",
    config_value: str = "wrong-endpoint.internal",
    region: str = "us-east-1",
    throughput: float = 100.0,
) -> str:
    """Fill placeholders in a log template with realistic values."""
    replacements = {
        "service": service_id,
        "dependency": dependency,
        "version": version,
        "error_rate": f"{error_rate * 100:.0f}",
        "memory_pct": f"{memory_pct:.0f}",
        "memory_used": f"{int(memory_pct * 20.48):.0f}",
        "memory_limit": "2048",
        "heap_used": f"{int(memory_pct * 10.24):.0f}",
        "heap_max": "1024",
        "p99_ms": f"{p99_ms:.0f}",
        "baseline_ms": f"{rng.randint(20, 80)}",
        "timeout_ms": f"{rng.choice([3000, 5000, 10000, 30000])}",
        "cooldown": f"{rng.randint(15, 60)}",
        "queued": f"{rng.randint(50, 500)}",
        "queue_depth": f"{rng.randint(100, 1000)}",
        "wait_ms": f"{rng.randint(500, 5000)}",
        "active": f"{rng.randint(15, 25)}",
        "pool_size": "20",
        "pending": f"{rng.randint(50, 200)}",
        "checkout_ms": f"{rng.randint(1000, 10000)}",
        "threshold_ms": "1000",
        "retry_count": f"{rng.randint(1, 5)}",
        "retry_max": "3",
        "backoff": f"{rng.choice([10, 15, 30, 60])}",
        "restart_count": f"{rng.randint(3, 15)}",
        "port": f"{rng.choice([5432, 6379, 8080, 9090, 3000])}",
        "db_user": rng.choice(["app_user", "service_account", "auth_user", "readonly"]),
        "config_key": config_key,
        "config_value": config_value,
        "feature_flag": rng.choice(["new_checkout_flow", "v2_api", "experimental_search", "dynamic_pricing"]),
        "region": region,
        "endpoint_count": f"{rng.randint(2, 5)}",
        "class": _random_class_name(rng),
        "method": _random_method(rng),
        "property": _random_property(rng),
        "attribute": _random_property(rng),
        "type": _random_class_name(rng),
        "handler": rng.choice(["processRequest", "handleEvent", "onMessage"]),
        "file": rng.choice(["app.py", "handler.js", "service.go", "controller.java"]),
        "line": f"{rng.randint(42, 350)}",
        "code_line": rng.choice(["result = response.data['items']", "return self.client.process(payload)"]),
        "field": rng.choice(["amount", "currency", "userId", "orderId"]),
        "goroutine_id": f"{rng.randint(100, 999)}",
        "table": rng.choice(["orders", "payments", "users", "inventory", "sessions"]),
        "query_ms": f"{rng.randint(5000, 30000)}",
        "tx_id": f"tx-{rng.randint(1000, 9999)}",
        "blocking_tx": f"tx-{rng.randint(1000, 9999)}",
        "lag_ms": f"{rng.randint(1000, 10000)}",
        "age_ms": f"{rng.randint(30000, 120000)}",
        "db_cpu": f"{rng.randint(5, 25)}",
        "app_cpu": f"{rng.randint(2, 15)}",
        "waiting": f"{rng.randint(50, 300)}",
        "baseline_hit_rate": f"{rng.uniform(95.0, 99.5):.1f}",
        "spike_factor": f"{rng.randint(10, 50)}",
        "concurrent_misses": f"{rng.randint(100, 1000)}",
        "key_pattern": rng.choice(["user:*", "product:*:price", "session:*", "inventory:*"]),
        "evicted": f"{rng.randint(10000, 100000)}",
        "interval": f"{rng.randint(10, 60)}",
        "gc_pct": f"{rng.randint(30, 70)}",
        "gc_pause": f"{rng.uniform(0.5, 3.0):.1f}",
        "open_fds": f"{rng.randint(800, 1024)}",
        "fd_limit": "1024",
        "fd_rate": f"{rng.randint(5, 20)}",
        "goroutine_count": f"{rng.randint(5000, 50000)}",
        "baseline": f"{rng.randint(50, 200)}",
        "leak_source": rng.choice(["http.ListenAndServe", "grpc.NewServer", "sql.Open"]),
        "hop_count": f"{rng.randint(2, 5)}",
        "remaining_ms": f"{rng.randint(-500, 10)}",
        "last_success": rng.choice(["45s", "2m30s", "5m12s"]),
        "throughput": f"{throughput:.0f}",
    }

    result = template
    for key, value in replacements.items():
        result = result.replace("{" + key + "}", str(value))
    return result


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def generate_log_message(
    failure_type: FailureType,
    service_id: str,
    rng: random.Random,
    dependency: str = "unknown",
    error_rate: float = 0.0,
    memory_pct: float = 50.0,
    p99_ms: float = 100.0,
    pool_pct: float = 10.0,
    version: str = "v1.0.0",
    config_key: str = "db_host",
    config_value: str = "wrong-endpoint.internal",
    region: str = "us-east-1",
    throughput: float = 100.0,
) -> str:
    """Render one synthetic log line for *failure_type* emitted by *service_id*.

    A template is drawn uniformly from the pool registered for the failure
    type, then its placeholders are substituted with the supplied metric
    values. Falls back to a generic error line when no templates exist.
    """
    candidates = _TEMPLATES.get(failure_type, [])
    if not candidates:
        return f"ERROR {service_id} Unknown failure condition detected."

    chosen = rng.choice(candidates)
    return _fill_placeholders(
        chosen,
        service_id,
        rng,
        dependency=dependency,
        error_rate=error_rate,
        memory_pct=memory_pct,
        p99_ms=p99_ms,
        pool_pct=pool_pct,
        version=version,
        config_key=config_key,
        config_value=config_value,
        region=region,
        throughput=throughput,
    )


def generate_healthy_log(service_id: str, rng: random.Random) -> str:
    """Produce a nominal-status log line for a healthy service being inspected."""
    # Every candidate line is rendered up front (each f-string consumes rng
    # draws in order) before one is selected, so the rng stream advances the
    # same way on every call.
    candidates = (
        f"INFO  {service_id} Health check passed. Status: UP. Response time: {rng.randint(2, 15)}ms.",
        f"INFO  {service_id} All endpoints healthy. Error rate: 0.0%. p99: {rng.randint(10, 50)}ms.",
        f"DEBUG {service_id} Metrics nominal. CPU: {rng.randint(5, 25)}%, Memory: {rng.randint(20, 45)}%, Connections: {rng.randint(2, 10)}/20.",
        f"INFO  {service_id} No anomalies detected in last 60s. request_count={rng.randint(500, 2000)}, error_count=0.",
    )
    return rng.choice(candidates)