Mist-ic committed on
Commit
ffe4aa5
·
1 Parent(s): ec93d50

Fix propagation recovery and config key visibility

Browse files

- propagation.py: services with no active direct failure now use propagation_error_rate
alone (instead of max() with their previous error_rate), so cascaded errors clear when upstream heals
- simulator.py: inspect_logs for config failures always appends a diagnostic line
naming the exact broken config key so agents can identify it reliably
- simulator.py: tune_config falls back to fuzzy key matching (the supplied key contains
the broken key or vice versa), so agents don't need to guess the exact key string

Files changed (2) hide show
  1. server/propagation.py +6 -2
  2. server/simulator.py +11 -2
server/propagation.py CHANGED
@@ -309,8 +309,12 @@ def propagate_failures(
309
  base_p99 = 100.0 # Baseline p99 in ms
310
  state.update_latency_percentiles(base_p99, q_mult, rng)
311
 
312
- # Combine direct failure error rate with propagation error rate
313
- combined_error = max(state.error_rate, state.propagation_error_rate)
 
 
 
 
314
  state.error_rate = min(1.0, combined_error)
315
 
316
  # Compute throughput (inverse of error rate, scaled by arrival)
 
309
  base_p99 = 100.0 # Baseline p99 in ms
310
  state.update_latency_percentiles(base_p99, q_mult, rng)
311
 
312
+ # Combine direct failure error rate with propagation error rate.
313
+ # Services with no direct failure recover naturally when upstream heals.
314
+ if state.has_active_failure:
315
+ combined_error = max(state.error_rate, state.propagation_error_rate)
316
+ else:
317
+ combined_error = state.propagation_error_rate
318
  state.error_rate = min(1.0, combined_error)
319
 
320
  # Compute throughput (inverse of error rate, scaled by arrival)
server/simulator.py CHANGED
@@ -311,6 +311,12 @@ class Simulator:
311
  region=self.graph.node_map[service_id].region if self.graph and service_id in self.graph.node_map else "us-east-1",
312
  throughput=svc.throughput_rps,
313
  ))
 
 
 
 
 
 
314
  elif svc.error_rate > 0.01:
315
  # Propagated errors — show upstream dependency issues
316
  dep = self._get_primary_dependency(service_id)
@@ -452,7 +458,10 @@ class Simulator:
452
 
453
  failure = self._get_failure_for_service(service_id)
454
  if failure and failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
455
- if key == failure.broken_config_key:
 
 
 
456
  # Correct fix!
457
  self.pending_effects.append(PendingEffect(
458
  action_type="tune_config_fix",
@@ -462,7 +471,7 @@ class Simulator:
462
  ))
463
  record["note"] = f"Config key '{key}' updated on {service_id}. Fix takes effect next tick."
464
  else:
465
- record["note"] = f"Config key '{key}' updated on {service_id}, but this may not be the broken key."
466
  else:
467
  # General config tune (e.g., timeout, retry)
468
  self._apply_config_immediately(svc, key, value)
 
311
  region=self.graph.node_map[service_id].region if self.graph and service_id in self.graph.node_map else "us-east-1",
312
  throughput=svc.throughput_rps,
313
  ))
314
+ # Guarantee the broken config key is always visible in logs for config failures
315
+ if failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME) and failure.broken_config_key:
316
+ logs_lines.append(
317
+ f"ERROR {service_id} Configuration diagnostic: key '{failure.broken_config_key}' has invalid value. "
318
+ f"Run: tune_config(service_id='{service_id}', key='{failure.broken_config_key}', value=<correct_value>)"
319
+ )
320
  elif svc.error_rate > 0.01:
321
  # Propagated errors — show upstream dependency issues
322
  dep = self._get_primary_dependency(service_id)
 
458
 
459
  failure = self._get_failure_for_service(service_id)
460
  if failure and failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
461
+ broken = failure.broken_config_key or ""
462
+ # Accept exact match or fuzzy match (key contains broken key name or vice versa)
463
+ key_matches = broken and (key == broken or broken in key or key in broken)
464
+ if key_matches:
465
  # Correct fix!
466
  self.pending_effects.append(PendingEffect(
467
  action_type="tune_config_fix",
 
471
  ))
472
  record["note"] = f"Config key '{key}' updated on {service_id}. Fix takes effect next tick."
473
  else:
474
+ record["note"] = f"Config key '{key}' updated on {service_id}, but this may not be the broken key (broken key: '{broken}')."
475
  else:
476
  # General config tune (e.g., timeout, retry)
477
  self._apply_config_immediately(svc, key, value)