Spaces:

Meta-HF-hackathon
/

sre-incident-simulator

Running

App Files Files Community

sre-incident-simulator / simulation /logs.py

Yaswanth-Bolla

Initial commit

1175c0b 4 days ago

raw

history blame contribute delete

7.34 kB

	"""
	Log stream generator.

	Produces realistic structured log entries — both signal and noise.
	Red herring logs are mixed in so the agent must filter real evidence
	from routine chatter.
	"""

	from __future__ import annotations

	import random
	from typing import Any, Dict, List, Optional


	# ------------------------------------------------------------------
	# Noise logs — routine operational chatter
	# ------------------------------------------------------------------

	# Each entry is a (level, message template) pair. generate_noise_logs() picks
	# one at random and fills it with str.format(), supplying the placeholders
	# n, a, i, m, r, h and d on every call; a template only uses the ones it names.
	# NOTE(review): {m} is the pool "max" in one template and the cache "misses"
	# in another, and {r}, {h}, {m} are drawn independently, so the numbers in a
	# single message are not guaranteed to be mutually consistent.
	_NOISE_TEMPLATES = [
	("INFO", "Processed {n} requests in last 60 seconds"),
	("INFO", "Health check passed — all dependencies reachable"),
	("INFO", "Connection pool stats: active={a}, idle={i}, max={m}"),
	("DEBUG", "Cache hit ratio: {r:.1%} — {h} hits, {m} misses"),
	("INFO", "Scheduled job 'metrics_export' completed in {d}ms"),
	("DEBUG", "TLS handshake completed with upstream in {d}ms"),
	("INFO", "Config reload: no changes detected"),
	("WARN", "Slow query detected: SELECT * FROM sessions took {d}ms"),
	("INFO", "Garbage collection: freed {n}MB in {d}ms"),
	("DEBUG", "Rate limiter: {n} requests allowed, 0 throttled"),
	]


	def generate_noise_logs(
	    service_name: str,
	    current_minute: int,
	    count: int = 3,
	) -> List[Dict[str, Any]]:
	    """Generate routine noise logs for a service.

	    Each entry is rendered from a randomly chosen ``_NOISE_TEMPLATES`` pair
	    and is emitted without a trace id.

	    Args:
	        service_name: Value stored under the ``"service"`` key of each entry.
	        current_minute: Simulation minute used to build the timestamp.
	        count: Number of entries to produce; ``0`` or less yields ``[]``.

	    Returns:
	        A list of ``count`` log dicts with the keys ``timestamp``, ``level``,
	        ``service``, ``message`` and ``trace_id`` (always ``None``).
	    """
	    logs = []
	    for _ in range(count):
	        level, template = random.choice(_NOISE_TEMPLATES)
	        # Every placeholder is supplied on each pass; str.format() silently
	        # ignores the keyword arguments a given template does not use.
	        message = template.format(
	            n=random.randint(100, 5000),
	            a=random.randint(5, 20),
	            i=random.randint(0, 10),
	            m=random.randint(20, 50),
	            r=random.uniform(0.85, 0.99),
	            h=random.randint(1000, 9000),
	            d=random.randint(1, 500),
	        )
	        # Go through the shared record builder instead of duplicating the
	        # dict/timestamp layout here, so noise and signal entries cannot
	        # drift apart in shape.
	        logs.append(_log(current_minute, level, service_name, message))
	    return logs


	# ------------------------------------------------------------------
	# Scenario-specific log generators (signal)
	# ------------------------------------------------------------------

	def generate_memory_leak_logs(
	    service_name: str,
	    current_minute: int,
	    memory_percent: float,
	) -> List[Dict[str, Any]]:
	    """Emit log entries whose severity escalates with memory pressure.

	    Thresholds are checked from highest to lowest:

	    * above 90 -> a FATAL out-of-memory entry plus an ERROR OOM-kill entry
	    * above 80 -> an ERROR allocation failure plus a WARN GC-overhead entry
	    * above 70 -> a single WARN heap-usage entry
	    * otherwise -> no entries

	    Args:
	        service_name: Service the entries are attributed to.
	        current_minute: Simulation minute used for the timestamps.
	        memory_percent: Current memory utilisation of the service.

	    Returns:
	        The list of generated log dicts (possibly empty).
	    """
	    # Drawn up front on every call, even when no entry ends up using it.
	    trace_id = f"trace-{random.randint(100000, 999999)}"

	    if memory_percent > 90:
	        oom_entry = _log(
	            current_minute, "FATAL", service_name,
	            "OutOfMemoryError: Java heap space — requested 256MB, available 12MB",
	            trace_id,
	        )
	        replica = random.randint(0, 2)
	        killed_entry = _log(
	            current_minute, "ERROR", service_name,
	            f"Container {service_name}-{replica} killed by OOM killer (exit code 137)",
	            trace_id,
	        )
	        return [oom_entry, killed_entry]

	    if memory_percent > 80:
	        wanted_mb = random.randint(64, 256)
	        alloc_entry = _log(
	            current_minute, "ERROR", service_name,
	            f"Memory allocation failed: unable to allocate {wanted_mb}MB for request processing",
	            trace_id,
	        )
	        gc_share = random.randint(80, 97)
	        # The GC warning is deliberately emitted without a trace id.
	        gc_entry = _log(
	            current_minute, "WARN", service_name,
	            f"GC overhead limit exceeded: spent {gc_share}% of time in GC",
	        )
	        return [alloc_entry, gc_entry]

	    if memory_percent > 70:
	        heap_warning = (
	            f"Heap usage warning: {memory_percent:.0f}% — approaching limit. "
	            "Consider increasing -Xmx or investigating leaks"
	        )
	        return [_log(current_minute, "WARN", service_name, heap_warning)]

	    return []


	def generate_auth_failure_logs(
	    service_name: str,
	    current_minute: int,
	    is_auth_service: bool = False,
	) -> List[Dict[str, Any]]:
	    """Emit the two log entries that describe an auth outage.

	    When ``is_auth_service`` is true the entries describe the failure from
	    the auth service's own point of view (null JWT secret, failed requests).
	    Otherwise they describe a caller seeing HTTP 500s from auth-service and
	    its circuit breaker state.

	    Args:
	        service_name: Service the entries are attributed to.
	        current_minute: Simulation minute used for the timestamps.
	        is_auth_service: Selects which of the two perspectives is generated.

	    Returns:
	        A list of exactly two log dicts; only the first carries the trace id.
	    """
	    trace_id = f"trace-{random.randint(100000, 999999)}"

	    if not is_auth_service:
	        attempt = random.randint(1, 3)
	        call_failed = _log(
	            current_minute, "ERROR", service_name,
	            f"Call to auth-service failed: HTTP 500 Internal Server Error — retrying ({attempt}/3)",
	            trace_id,
	        )
	        failure_count = random.randint(5, 20)
	        breaker = _log(
	            current_minute, "WARN", service_name,
	            f"Circuit breaker for auth-service: state=HALF_OPEN, failures={failure_count}, threshold=10",
	        )
	        return [call_failed, breaker]

	    root_cause = _log(
	        current_minute, "ERROR", service_name,
	        "NullPointerException: configuration key 'auth.jwt.secret' is null — cannot validate tokens",
	        trace_id,
	    )
	    rejected = random.randint(50, 200)
	    summary = _log(
	        current_minute, "ERROR", service_name,
	        f"Authentication failed for {rejected} requests in last 60s — returning HTTP 500",
	    )
	    return [root_cause, summary]


	def generate_deadlock_logs(
	    service_name: str,
	    current_minute: int,
	    waiting_on: str,
	) -> List[Dict[str, Any]]:
	    """Emit log entries for a service stuck waiting on another service.

	    A WARN "waiting on" entry is always produced. An ERROR timeout entry is
	    added with probability 0.4 and an ERROR thread-pool-exhausted entry with
	    probability 0.2; the two draws are independent.

	    Args:
	        service_name: Service the entries are attributed to.
	        current_minute: Simulation minute used for the timestamps.
	        waiting_on: Name of the dependency this service is blocked on.

	    Returns:
	        A list of one to three log dicts.
	    """
	    trace_id = f"trace-{random.randint(100000, 999999)}"

	    blocked_ms = random.randint(5000, 25000)
	    entries = [
	        _log(
	            current_minute, "WARN", service_name,
	            f"Request {trace_id} waiting on {waiting_on}: blocked for {blocked_ms}ms — no response",
	            trace_id,
	        )
	    ]

	    emit_timeout = random.random() < 0.4
	    if emit_timeout:
	        retry_no = random.randint(3, 8)
	        entries.append(_log(
	            current_minute, "ERROR", service_name,
	            f"Timeout calling {waiting_on}: deadline exceeded after 30000ms. Retry attempt {retry_no} of 10",
	            trace_id,
	        ))

	    emit_exhaustion = random.random() < 0.2
	    if emit_exhaustion:
	        pool_size = random.randint(50, 200)
	        # Pool exhaustion is service-wide, so it is logged without a trace id.
	        entries.append(_log(
	            current_minute, "ERROR", service_name,
	            f"Thread pool exhausted: all {pool_size} threads blocked waiting on downstream calls",
	        ))

	    return entries


	# ------------------------------------------------------------------
	# Red herring logs — plausible but misleading
	# ------------------------------------------------------------------

	# Each entry is a (level, message template) pair. generate_red_herring_logs()
	# picks one at random and fills it with str.format(), supplying:
	#   {svc} - a service name chosen from that function's own list
	#   {d}   - an integer from 100 to 3000
	#   {n}   - an integer from 3 to 85 (used both as "days" and as a percentage)
	_RED_HERRING_TEMPLATES = [
	("WARN", "DNS resolution for {svc}.internal took {d}ms (threshold: 100ms)"),
	("WARN", "TLS certificate for {svc}.internal expires in {n} days"),
	("WARN", "Disk usage on /var/log: {n}% — consider log rotation"),
	("ERROR", "Failed to export metrics to Prometheus: connection timeout after {d}ms"),
	("WARN", "Background job 'cleanup_sessions' took {d}ms (expected <500ms)"),
	("ERROR", "Redis SLOWLOG: KEYS pattern='session:*' took {d}ms"),
	]


	def generate_red_herring_logs(
	    service_name: str,
	    current_minute: int,
	    count: int = 1,
	) -> List[Dict[str, Any]]:
	    """Produce misleading-but-plausible log entries for a service.

	    Every entry is rendered from a random ``_RED_HERRING_TEMPLATES`` pair.
	    The service named inside the message text is picked independently of
	    ``service_name``, which is only used as the entry's ``"service"`` field.

	    Args:
	        service_name: Service the entries are attributed to.
	        current_minute: Simulation minute used for the timestamps.
	        count: Number of entries to produce; ``0`` or less yields ``[]``.

	    Returns:
	        A list of ``count`` log dicts without trace ids.
	    """
	    candidate_services = [
	        "api_gateway", "auth", "orders", "payment", "cache", "database", "queue",
	    ]

	    def _one_entry() -> Dict[str, Any]:
	        # Draw order matters for reproducibility under a seeded RNG:
	        # template, service, duration, count, then the timestamp second.
	        severity, pattern = random.choice(_RED_HERRING_TEMPLATES)
	        mentioned = random.choice(candidate_services)
	        millis = random.randint(100, 3000)
	        amount = random.randint(3, 85)
	        text = pattern.format(svc=mentioned, d=millis, n=amount)
	        return _log(current_minute, severity, service_name, text)

	    return [_one_entry() for _ in range(count)]


	# ------------------------------------------------------------------
	# Helper
	# ------------------------------------------------------------------

	def _log(
	    minute: int,
	    level: str,
	    service: str,
	    message: str,
	    trace_id: Optional[str] = None,
	) -> Dict[str, Any]:
	    """Build one structured log record.

	    The timestamp is ``minute`` minutes after 2025-01-15T14:00:00Z plus a
	    random 0-59 second offset. Minutes outside 0-59 roll over into the
	    neighbouring hour (or day) instead of producing an invalid clock value
	    such as ``14:60:07``.

	    Args:
	        minute: Simulation minute, counted from 14:00 on the fixed base date.
	        level: Severity label stored as-is (e.g. ``"INFO"``, ``"ERROR"``).
	        service: Name of the emitting service.
	        message: Fully rendered log message.
	        trace_id: Optional trace identifier; ``None`` when the entry is not
	            tied to a request.

	    Returns:
	        A dict with the keys ``timestamp``, ``level``, ``service``,
	        ``message`` and ``trace_id``.
	    """
	    # Function-scope import keeps this helper self-contained.
	    from datetime import datetime, timedelta

	    second = random.randint(0, 59)
	    # Let datetime arithmetic carry minutes into hours/days.
	    moment = datetime(2025, 1, 15, 14, 0, 0) + timedelta(minutes=minute, seconds=second)
	    return {
	        "timestamp": moment.strftime("%Y-%m-%dT%H:%M:%SZ"),
	        "level": level,
	        "service": service,
	        "message": message,
	        "trace_id": trace_id,
	    }