Spaces:
Sleeping
Sleeping
Improve environment depth: investigate action, cascading deps, rich observations, harder hard task
5f6895d | [ | |
| { | |
| "flag_name": "circuit_breaker_payments", | |
| "description": "Circuit breaker that disables payment processing when error rate exceeds 5%", | |
| "rollout_percentage": 0.0, | |
| "age_days": 30, | |
| "last_modified_days": 0, | |
| "owner": "payments-team", | |
| "owner_active": true, | |
| "num_code_references": 5, | |
| "has_dependencies": true, | |
| "dependent_flags": ["payment_fallback_mode"], | |
| "is_kill_switch": true, | |
| "has_active_incident": true, | |
| "usage_last_30d": 15000, | |
| "in_active_experiment": false, | |
| "services": ["payment-service", "checkout-service", "billing-service"], | |
| "code_snippet": "if circuit_breakers.is_tripped('circuit_breaker_payments'):\n metrics.increment('payments.circuit_breaker.tripped')\n raise PaymentServiceUnavailable('Circuit breaker active')", | |
| "last_commit_message": "hotfix: lower circuit breaker threshold from 10% to 5%", | |
| "pr_context": "PR #3400: Emergency circuit breaker adjustment — INC-5000 in progress, payment error rate at 4.8%", | |
| "related_incidents": ["INC-5000: Payment processing degradation (ACTIVE)", "INC-4800: Stripe API intermittent 503s (related)"], | |
| "investigation_notes": "ACTIVE INCIDENT INC-5000. Payment error rate currently at 4.8%, just below 5% threshold. Stripe investigating upstream issue. On-call team monitoring. DO NOT TOUCH.", | |
| "correct_action": "keep", | |
| "explanation": "Active kill switch during a live incident — absolutely must not be removed" | |
| }, | |
| { | |
| "flag_name": "payment_fallback_mode", | |
| "description": "Enables fallback to secondary payment processor when primary is down", | |
| "rollout_percentage": 1.0, | |
| "age_days": 60, | |
| "last_modified_days": 0, | |
| "owner": "payments-team", | |
| "owner_active": true, | |
| "num_code_references": 8, | |
| "has_dependencies": false, | |
| "dependent_flags": [], | |
| "is_kill_switch": false, | |
| "has_active_incident": true, | |
| "usage_last_30d": 5000, | |
| "in_active_experiment": false, | |
| "services": ["payment-service"], | |
| "code_snippet": "processor = fallback_processor if flags.get('payment_fallback_mode') else primary_processor\n# Fallback routes to Adyen when Stripe is down", | |
| "last_commit_message": "hotfix: enable payment fallback for INC-5000", | |
| "pr_context": "PR #3401: Enable fallback processor — activated during INC-5000 to maintain payment processing", | |
| "related_incidents": ["INC-5000: Payment processing degradation (ACTIVE)"], | |
| "investigation_notes": "Activated 2 hours ago for INC-5000. Currently processing 30% of payments through Adyen fallback. Removing this would cause immediate payment failures.", | |
| "correct_action": "keep", | |
| "explanation": "Actively used during current incident, depended on by circuit_breaker_payments" | |
| }, | |
| { | |
| "flag_name": "new_auth_middleware", | |
| "description": "Replaces session-token auth with JWT-based authentication", | |
| "rollout_percentage": 0.95, | |
| "age_days": 120, | |
| "last_modified_days": 30, | |
| "owner": "security-team", | |
| "owner_active": true, | |
| "num_code_references": 45, | |
| "has_dependencies": true, | |
| "dependent_flags": ["jwt_refresh_tokens", "auth_rate_limiting_v2", "legacy_session_cleanup"], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 800000, | |
| "in_active_experiment": false, | |
| "services": ["auth-service", "api-gateway", "user-service", "admin-panel"], | |
| "code_snippet": "middleware = JWTAuthMiddleware(secret=config.jwt_secret) if flags.get('new_auth_middleware') else SessionAuthMiddleware(store=redis_client)\napp.add_middleware(middleware)", | |
| "last_commit_message": "security: patch JWT validation for CVE-2024-XXXXX", | |
| "pr_context": "PR #2000: Auth middleware migration — security-team. 3 dependent flags. 5% of users still on session auth (admin-panel legacy integration).", | |
| "related_incidents": ["INC-3800: JWT token validation bypass (patched)", "INC-3500: Session token leak in logs (triggered migration)"], | |
| "investigation_notes": "95% rolled out. The remaining 5% are admin-panel users on a legacy SSO integration that doesn't support JWT. Security-team has a migration plan but needs admin-panel team cooperation. 3 child flags form a dependency chain. Removing any one flag without the others will cause auth failures.", | |
| "correct_action": "escalate", | |
| "explanation": "95% rollout but 3 dependent flags across 4 services, 45 code refs — too risky for unilateral action, needs security team coordination" | |
| }, | |
| { | |
| "flag_name": "jwt_refresh_tokens", | |
| "description": "Enables refresh token rotation for JWT auth", | |
| "rollout_percentage": 0.95, | |
| "age_days": 100, | |
| "last_modified_days": 25, | |
| "owner": "security-team", | |
| "owner_active": true, | |
| "num_code_references": 12, | |
| "has_dependencies": true, | |
| "dependent_flags": ["auth_rate_limiting_v2"], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 600000, | |
| "in_active_experiment": false, | |
| "services": ["auth-service", "api-gateway"], | |
| "code_snippet": "if flags.get('jwt_refresh_tokens'):\n token_pair = issue_rotated_tokens(refresh_token)\nelse:\n token_pair = issue_static_token(session_id)", | |
| "last_commit_message": "security: reduce refresh token TTL to 7 days", | |
| "pr_context": "PR #2050: JWT refresh tokens — depends on new_auth_middleware being enabled. Auth_rate_limiting_v2 depends on this for token-aware rate limiting.", | |
| "related_incidents": ["INC-3800: JWT token validation bypass (this flag was part of the fix)"], | |
| "investigation_notes": "Middle of a 3-flag dependency chain: new_auth_middleware -> jwt_refresh_tokens -> auth_rate_limiting_v2. Cannot be modified independently. Security-team must coordinate all 3.", | |
| "correct_action": "escalate", | |
| "explanation": "Part of auth middleware chain, depends on and is depended upon — must be coordinated" | |
| }, | |
| { | |
| "flag_name": "auth_rate_limiting_v2", | |
| "description": "New rate limiting algorithm that accounts for JWT token refresh patterns", | |
| "rollout_percentage": 0.8, | |
| "age_days": 75, | |
| "last_modified_days": 10, | |
| "owner": "security-team", | |
| "owner_active": true, | |
| "num_code_references": 8, | |
| "has_dependencies": false, | |
| "dependent_flags": [], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 400000, | |
| "in_active_experiment": false, | |
| "services": ["api-gateway"], | |
| "code_snippet": "limiter = TokenAwareRateLimiter() if flags.get('auth_rate_limiting_v2') else FixedWindowLimiter()\nif not limiter.allow(request):\n return Response(status_code=429)", | |
| "last_commit_message": "fix: rate limiter false positives on token refresh burst", | |
| "pr_context": "PR #2100: Auth rate limiting v2 — tail of the auth migration chain. Depends on jwt_refresh_tokens being active.", | |
| "related_incidents": [], | |
| "investigation_notes": "Only 80% rolled out, recently had false positive issue fixed. Part of the auth chain still being deployed. Cannot go to 100% until jwt_refresh_tokens is at 100%.", | |
| "correct_action": "keep", | |
| "explanation": "Only 80% rolled out, recently modified, part of the auth chain still being deployed" | |
| }, | |
| { | |
| "flag_name": "legacy_session_cleanup", | |
| "description": "Background job that migrates old session tokens to new JWT format", | |
| "rollout_percentage": 1.0, | |
| "age_days": 90, | |
| "last_modified_days": 60, | |
| "owner": "security-team", | |
| "owner_active": true, | |
| "num_code_references": 3, | |
| "has_dependencies": false, | |
| "dependent_flags": [], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 1000, | |
| "in_active_experiment": false, | |
| "services": ["auth-service"], | |
| "code_snippet": "# Runs as a daily cron job\nif flags.get('legacy_session_cleanup'):\n stale_sessions = redis.scan('session:*', count=1000)\n for s in stale_sessions:\n migrate_to_jwt(s)", | |
| "last_commit_message": "chore: reduce cleanup batch size to avoid Redis pressure", | |
| "pr_context": "PR #2080: Session cleanup migration — runs daily to convert old sessions. Part of new_auth_middleware migration.", | |
| "related_incidents": [], | |
| "investigation_notes": "Migration is 99.8% complete. Only 1000 sessions left (from 500K). Job will finish within a week. After completion, flag can be removed. But must verify with security-team first since it's part of the auth chain.", | |
| "correct_action": "deprecate", | |
| "explanation": "Fully rolled out, low usage suggests migration is nearly complete — deprecate and verify migration status before removal" | |
| }, | |
| { | |
| "flag_name": "experiment_pricing_tier_c", | |
| "description": "Shows pricing tier C in the A/B/C test for premium subscriptions", | |
| "rollout_percentage": 0.33, | |
| "age_days": 45, | |
| "last_modified_days": 3, | |
| "owner": "growth-team", | |
| "owner_active": true, | |
| "num_code_references": 7, | |
| "has_dependencies": true, | |
| "dependent_flags": ["experiment_pricing_tier_b"], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 60000, | |
| "in_active_experiment": true, | |
| "services": ["pricing-service", "frontend-web", "analytics-service"], | |
| "code_snippet": "const pricingTier = flags.get('experiment_pricing_tier_c') ? TIER_C\n : flags.get('experiment_pricing_tier_b') ? TIER_B\n : TIER_A; // control group", | |
| "last_commit_message": "experiment: adjust tier C pricing from $29 to $24", | |
| "pr_context": "PR #3250: Pricing A/B/C test — growth-team testing 3 price points. Tiers B and C are variants, A is control. Removing either variant mid-experiment invalidates results.", | |
| "related_incidents": [], | |
| "investigation_notes": "Multi-variant experiment running for 3 weeks. Results expected in 2 more weeks. Tier C showing promising conversion. Removing any variant mid-experiment wastes $50K in ad spend.", | |
| "correct_action": "keep", | |
| "explanation": "Active multi-variant experiment, recently modified, with dependent flag" | |
| }, | |
| { | |
| "flag_name": "experiment_pricing_tier_b", | |
| "description": "Shows pricing tier B in the A/B/C test for premium subscriptions", | |
| "rollout_percentage": 0.33, | |
| "age_days": 45, | |
| "last_modified_days": 3, | |
| "owner": "growth-team", | |
| "owner_active": true, | |
| "num_code_references": 7, | |
| "has_dependencies": false, | |
| "dependent_flags": [], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 60000, | |
| "in_active_experiment": true, | |
| "services": ["pricing-service", "frontend-web", "analytics-service"], | |
| "code_snippet": "// See experiment_pricing_tier_c for full logic\n// This flag gates the Tier B variant ($19/mo vs control $14/mo)", | |
| "last_commit_message": "experiment: adjust tier B pricing from $19 to $17", | |
| "pr_context": "PR #3251: Pricing tier B adjustment — paired with tier C experiment", | |
| "related_incidents": [], | |
| "investigation_notes": "Part of the same A/B/C experiment as tier C. Cannot be removed independently. Growth-team running final analysis.", | |
| "correct_action": "keep", | |
| "explanation": "Active experiment variant, same A/B/C test — must stay" | |
| }, | |
| { | |
| "flag_name": "deprecated_cache_layer", | |
| "description": "Routes reads through old Redis cluster instead of new Memcached layer", | |
| "rollout_percentage": 0.0, | |
| "age_days": 400, | |
| "last_modified_days": 350, | |
| "owner": "carlos.martinez", | |
| "owner_active": false, | |
| "num_code_references": 25, | |
| "has_dependencies": true, | |
| "dependent_flags": ["cache_warmup_job"], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 0, | |
| "in_active_experiment": false, | |
| "services": ["cache-proxy", "backend-core", "search-service", "recommendation-service"], | |
| "code_snippet": "cache = RedisCluster(nodes=LEGACY_NODES) if flags.get('deprecated_cache_layer') else MemcachedClient(servers=NEW_SERVERS)\nresult = cache.get(cache_key)", | |
| "last_commit_message": "migrate: disable old Redis cache layer", | |
| "pr_context": "PR #1100: Cache migration — carlos.martinez (left company). 25 code references across 4 services. No cleanup plan documented.", | |
| "related_incidents": ["INC-1500: Redis cluster OOM (triggered migration)", "INC-1800: Cache inconsistency during migration (resolved)"], | |
| "investigation_notes": "carlos.martinez left 8 months ago. No handoff. Legacy Redis cluster is still running (costs $3K/month) but receives no traffic. 25 code refs span 4 services owned by 4 different teams. Need coordinated cleanup but no single owner.", | |
| "correct_action": "escalate", | |
| "explanation": "Disabled, inactive owner, but 25 refs across 4 services with a dependent flag — needs coordinated cleanup with multiple teams" | |
| }, | |
| { | |
| "flag_name": "cache_warmup_job", | |
| "description": "Pre-warms the cache layer on deployment with hot keys", | |
| "rollout_percentage": 1.0, | |
| "age_days": 380, | |
| "last_modified_days": 200, | |
| "owner": "carlos.martinez", | |
| "owner_active": false, | |
| "num_code_references": 4, | |
| "has_dependencies": false, | |
| "dependent_flags": [], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 50, | |
| "in_active_experiment": false, | |
| "services": ["cache-proxy"], | |
| "code_snippet": "# Runs on deployment hook\nif flags.get('cache_warmup_job'):\n hot_keys = analytics.get_top_keys(limit=10000)\n cache.mset({k: db.get(k) for k in hot_keys})", | |
| "last_commit_message": "chore: reduce warmup batch to 10K keys", | |
| "pr_context": "PR #1120: Cache warmup — child of deprecated_cache_layer migration. carlos.martinez noted: 'warmup still useful for Memcached, review after migration complete'", | |
| "related_incidents": [], | |
| "investigation_notes": "Still runs on every deployment but only warms 50 keys (down from 10K). The warmup logic now targets Memcached (new layer) not Redis. Might still be useful but needs review by whoever owns cache-proxy now.", | |
| "correct_action": "deprecate", | |
| "explanation": "Fully rolled out, low usage, inactive owner, but tied to deprecated_cache_layer — deprecate and coordinate removal with parent" | |
| }, | |
| { | |
| "flag_name": "realtime_sync_engine", | |
| "description": "Enables WebSocket-based realtime data sync instead of polling", | |
| "rollout_percentage": 0.98, | |
| "age_days": 150, | |
| "last_modified_days": 90, | |
| "owner": "platform-team", | |
| "owner_active": true, | |
| "num_code_references": 35, | |
| "has_dependencies": true, | |
| "dependent_flags": ["ws_connection_pooling", "sync_conflict_resolver"], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 2000000, | |
| "in_active_experiment": false, | |
| "services": ["sync-service", "frontend-web", "mobile-api", "ws-gateway", "conflict-resolver"], | |
| "code_snippet": "const transport = flags.get('realtime_sync_engine')\n ? new WebSocketTransport(wsGateway)\n : new PollingTransport({interval: 5000}); // legacy 5s polling\n\n// WARNING: 2% of users on legacy polling are enterprise clients\n// with firewall rules blocking WebSocket connections", | |
| "last_commit_message": "fix: WebSocket reconnection logic for flaky mobile networks", | |
| "pr_context": "PR #2200: Realtime sync migration — 2% of enterprise clients can't use WebSocket due to corporate firewalls. Platform-team working with enterprise team on proxy solution. 35 code refs across 5 services.", | |
| "related_incidents": ["INC-3000: WebSocket connection storm during deploy (resolved)", "INC-3300: Sync conflict data loss on reconnect (resolved)", "INC-3600: Mobile app crash on WebSocket timeout (resolved)"], | |
| "investigation_notes": "3 past incidents. 2% holdout is intentional — enterprise clients behind firewalls. 35 code references across 5 services. 2 dependent flags. Enterprise proxy solution ETA is Q3. Removing flag would break 2% of enterprise users (high-value accounts).", | |
| "correct_action": "escalate", | |
| "explanation": "98% rolled out but 2% holdout is intentional for enterprise. 3 incidents, 35 code refs, 5 services, 2 deps — extremely complex, needs platform-team + enterprise-team coordination" | |
| }, | |
| { | |
| "flag_name": "ws_connection_pooling", | |
| "description": "Connection pooling for WebSocket gateway to reduce memory usage", | |
| "rollout_percentage": 1.0, | |
| "age_days": 120, | |
| "last_modified_days": 45, | |
| "owner": "platform-team", | |
| "owner_active": true, | |
| "num_code_references": 6, | |
| "has_dependencies": false, | |
| "dependent_flags": [], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 1800000, | |
| "in_active_experiment": false, | |
| "services": ["ws-gateway"], | |
| "code_snippet": "pool = ConnectionPool(max_size=10000) if flags.get('ws_connection_pooling') else UnpooledConnections()\nws_server = WebSocketServer(connection_manager=pool)", | |
| "last_commit_message": "perf: tune connection pool max size for peak traffic", | |
| "pr_context": "PR #2250: WS connection pooling — performance optimization. Depends on realtime_sync_engine being active.", | |
| "related_incidents": [], | |
| "investigation_notes": "Fully rolled out and stable. BUT: this is a child of realtime_sync_engine. If the parent flag is ever rolled back (e.g., during incident), this flag becomes meaningless. Should be cleaned up together with parent, not independently.", | |
| "correct_action": "keep", | |
| "explanation": "Fully rolled out but parent flag (realtime_sync_engine) is still at 98% and under active management — keep until parent is resolved" | |
| }, | |
| { | |
| "flag_name": "sync_conflict_resolver", | |
| "description": "Automatic conflict resolution for concurrent edits via operational transform", | |
| "rollout_percentage": 0.7, | |
| "age_days": 80, | |
| "last_modified_days": 5, | |
| "owner": "platform-team", | |
| "owner_active": true, | |
| "num_code_references": 18, | |
| "has_dependencies": false, | |
| "dependent_flags": [], | |
| "is_kill_switch": false, | |
| "has_active_incident": false, | |
| "usage_last_30d": 500000, | |
| "in_active_experiment": false, | |
| "services": ["conflict-resolver", "sync-service", "frontend-web"], | |
| "code_snippet": "if flags.get('sync_conflict_resolver'):\n resolved = ot_transform(local_ops, remote_ops)\nelse:\n resolved = last_write_wins(local_doc, remote_doc) # data loss risk!", | |
| "last_commit_message": "fix: OT edge case with concurrent delete+insert", | |
| "pr_context": "PR #3350: Conflict resolver at 70% — actively being developed. Last-write-wins fallback has known data loss risk. Team pushing to 100% ASAP.", | |
| "related_incidents": ["INC-3300: Sync conflict data loss on reconnect (this flag was created to fix it)"], | |
| "investigation_notes": "Only 70% rolled out. Active development — commit 5 days ago fixing edge cases. The fallback (last-write-wins) causes data loss. Team is pushing to 100% but found a new edge case. Cannot remove or deprecate — must keep and monitor.", | |
| "correct_action": "keep", | |
| "explanation": "70% rollout, active development, fallback has data loss risk — must keep" | |
| } | |
| ] | |