Fix propagation recovery and config key visibility
Browse files
- propagation.py: services without a direct (active) failure now use propagation_error_rate
alone (not the max of it and their previous error_rate), so cascaded errors clear when upstream heals
- simulator.py: inspect_logs for config failures always appends a diagnostic line
naming the exact broken config key so agents can identify it reliably
- simulator.py: for config failures, tune_config accepts fuzzy key matching as a fallback
(the given key contains the broken key or vice versa), so agents don't need to guess the exact key string
- server/propagation.py +6 -2
- server/simulator.py +11 -2
server/propagation.py
CHANGED
|
@@ -309,8 +309,12 @@ def propagate_failures(
|
|
| 309 |
base_p99 = 100.0 # Baseline p99 in ms
|
| 310 |
state.update_latency_percentiles(base_p99, q_mult, rng)
|
| 311 |
|
| 312 |
-
# Combine direct failure error rate with propagation error rate
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
state.error_rate = min(1.0, combined_error)
|
| 315 |
|
| 316 |
# Compute throughput (inverse of error rate, scaled by arrival)
|
|
|
|
| 309 |
base_p99 = 100.0 # Baseline p99 in ms
|
| 310 |
state.update_latency_percentiles(base_p99, q_mult, rng)
|
| 311 |
|
| 312 |
+
# Combine direct failure error rate with propagation error rate.
|
| 313 |
+
# Services with no direct failure recover naturally when upstream heals.
|
| 314 |
+
if state.has_active_failure:
|
| 315 |
+
combined_error = max(state.error_rate, state.propagation_error_rate)
|
| 316 |
+
else:
|
| 317 |
+
combined_error = state.propagation_error_rate
|
| 318 |
state.error_rate = min(1.0, combined_error)
|
| 319 |
|
| 320 |
# Compute throughput (inverse of error rate, scaled by arrival)
|
server/simulator.py
CHANGED
|
@@ -311,6 +311,12 @@ class Simulator:
|
|
| 311 |
region=self.graph.node_map[service_id].region if self.graph and service_id in self.graph.node_map else "us-east-1",
|
| 312 |
throughput=svc.throughput_rps,
|
| 313 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
elif svc.error_rate > 0.01:
|
| 315 |
# Propagated errors — show upstream dependency issues
|
| 316 |
dep = self._get_primary_dependency(service_id)
|
|
@@ -452,7 +458,10 @@ class Simulator:
|
|
| 452 |
|
| 453 |
failure = self._get_failure_for_service(service_id)
|
| 454 |
if failure and failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
|
| 455 |
-
|
|
|
|
|
|
|
|
|
|
| 456 |
# Correct fix!
|
| 457 |
self.pending_effects.append(PendingEffect(
|
| 458 |
action_type="tune_config_fix",
|
|
@@ -462,7 +471,7 @@ class Simulator:
|
|
| 462 |
))
|
| 463 |
record["note"] = f"Config key '{key}' updated on {service_id}. Fix takes effect next tick."
|
| 464 |
else:
|
| 465 |
-
record["note"] = f"Config key '{key}' updated on {service_id}, but this may not be the broken key."
|
| 466 |
else:
|
| 467 |
# General config tune (e.g., timeout, retry)
|
| 468 |
self._apply_config_immediately(svc, key, value)
|
|
|
|
| 311 |
region=self.graph.node_map[service_id].region if self.graph and service_id in self.graph.node_map else "us-east-1",
|
| 312 |
throughput=svc.throughput_rps,
|
| 313 |
))
|
| 314 |
+
# Guarantee the broken config key is always visible in logs for config failures
|
| 315 |
+
if failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME) and failure.broken_config_key:
|
| 316 |
+
logs_lines.append(
|
| 317 |
+
f"ERROR {service_id} Configuration diagnostic: key '{failure.broken_config_key}' has invalid value. "
|
| 318 |
+
f"Run: tune_config(service_id='{service_id}', key='{failure.broken_config_key}', value=<correct_value>)"
|
| 319 |
+
)
|
| 320 |
elif svc.error_rate > 0.01:
|
| 321 |
# Propagated errors — show upstream dependency issues
|
| 322 |
dep = self._get_primary_dependency(service_id)
|
|
|
|
| 458 |
|
| 459 |
failure = self._get_failure_for_service(service_id)
|
| 460 |
if failure and failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
|
| 461 |
+
broken = failure.broken_config_key or ""
|
| 462 |
+
# Accept exact match or fuzzy match (key contains broken key name or vice versa)
|
| 463 |
+
key_matches = broken and (key == broken or broken in key or key in broken)
|
| 464 |
+
if key_matches:
|
| 465 |
# Correct fix!
|
| 466 |
self.pending_effects.append(PendingEffect(
|
| 467 |
action_type="tune_config_fix",
|
|
|
|
| 471 |
))
|
| 472 |
record["note"] = f"Config key '{key}' updated on {service_id}. Fix takes effect next tick."
|
| 473 |
else:
|
| 474 |
+
record["note"] = f"Config key '{key}' updated on {service_id}, but this may not be the broken key (broken key: '{broken}')."
|
| 475 |
else:
|
| 476 |
# General config tune (e.g., timeout, retry)
|
| 477 |
self._apply_config_immediately(svc, key, value)
|