| """Tests for #1630 — gateway infinite 400 failure loop prevention. |
| |
| Verifies that: |
| 1. Generic 400 errors with large sessions are treated as context-length errors |
| and trigger compression instead of aborting. |
| 2. The gateway does not persist messages when the agent fails early, preventing |
| the session from growing on each failure. |
| 3. Context-overflow failures produce helpful error messages suggesting /compact. |
| """ |
|
|
| import pytest |
| from types import SimpleNamespace |
| from unittest.mock import MagicMock, patch |
|
|
|
|
| |
| |
| |
|
|
|
|
class TestGeneric400Heuristic:
    """The agent should treat a generic 400 with a large session as a
    probable context-length error and trigger compression, not abort.

    NOTE(review): the test methods below do not call ``AIAgent``; they
    evaluate a local copy of the heuristic (the ``_``-prefixed helpers).
    The phrases and thresholds presumably mirror ``run_agent`` -- confirm
    they stay in sync with the production code.
    """

    # Substrings that identify an explicit context-length error message.
    _CONTEXT_PHRASES = (
        'context length', 'context size', 'maximum context',
        'token limit', 'too many tokens', 'reduce the length',
        'exceeds the limit', 'context window',
        'request entity too large',
        'prompt is too long',
    )
    # Context window size used by every scenario in this class.
    _CTX_LEN = 200000
    # A session counts as "large" above this fraction of the context window...
    _LARGE_SESSION_RATIO = 0.4
    # ...or above this many messages, whichever is hit first.
    _LARGE_SESSION_MESSAGES = 80
    # Error messages shorter than this (after stripping) count as "generic".
    _GENERIC_ERROR_MAX_CHARS = 30

    def _make_agent(self):
        """Create a minimal AIAgent for testing error handling."""
        with (
            patch("run_agent.get_tool_definitions", return_value=[]),
            patch("run_agent.check_toolset_requirements", return_value={}),
            patch("run_agent.OpenAI"),
        ):
            from run_agent import AIAgent
            a = AIAgent(
                api_key="test-key-12345",
                quiet_mode=True,
                skip_context_files=True,
                skip_memory=True,
            )
            a.client = MagicMock()
            a._cached_system_prompt = "You are helpful."
            a._use_prompt_caching = False
            a.tool_delay = 0
            a.compression_enabled = False
            return a

    @classmethod
    def _mentions_context_length(cls, error_msg):
        """Return True if *error_msg* contains a known context-length phrase.

        Matching is a plain, case-sensitive substring test.
        """
        return any(phrase in error_msg for phrase in cls._CONTEXT_PHRASES)

    @classmethod
    def _is_large_session(cls, approx_tokens, api_messages):
        """Return True if the token estimate or message count is "large"."""
        return (
            approx_tokens > cls._CTX_LEN * cls._LARGE_SESSION_RATIO
            or len(api_messages) > cls._LARGE_SESSION_MESSAGES
        )

    @classmethod
    def _is_generic_error(cls, error_msg):
        """Return True if the stripped message is too short to be descriptive."""
        return len(error_msg.strip()) < cls._GENERIC_ERROR_MAX_CHARS

    def test_generic_400_with_small_session_is_client_error(self):
        """A generic 400 with a small session should still be treated
        as a non-retryable client error (not context overflow)."""
        error_msg = "error"
        approx_tokens = 1000
        api_messages = [{"role": "user", "content": "hi"}]

        assert not self._mentions_context_length(error_msg)
        # The message is generic, so the small session alone is what keeps
        # the heuristic from firing.
        assert self._is_generic_error(error_msg)
        assert not self._is_large_session(approx_tokens, api_messages)

    def test_generic_400_with_large_token_count_triggers_heuristic(self):
        """A generic 400 with high token count should be treated as
        probable context overflow."""
        error_msg = "error"
        approx_tokens = 100000
        api_messages = [{"role": "user", "content": "hi"}] * 20

        assert not self._mentions_context_length(error_msg)
        assert self._is_large_session(approx_tokens, api_messages)
        assert self._is_generic_error(error_msg)

    def test_generic_400_with_many_messages_triggers_heuristic(self):
        """A generic 400 with >80 messages should trigger the heuristic
        even if estimated tokens are low."""
        error_msg = "error"
        approx_tokens = 5000
        api_messages = [{"role": "user", "content": "x"}] * 100

        assert self._is_large_session(approx_tokens, api_messages)
        assert self._is_generic_error(error_msg)

    def test_specific_error_message_bypasses_heuristic(self):
        """A 400 with a specific, long error message should NOT trigger
        the heuristic even with a large session."""
        error_msg = "invalid model: anthropic/claude-nonexistent-model is not available"
        approx_tokens = 100000

        # Confirm the scenario really is a large session, so the descriptive
        # message is the only thing preventing the heuristic.
        assert self._is_large_session(approx_tokens, [])
        assert not self._is_generic_error(error_msg)

    def test_descriptive_context_error_caught_by_phrases(self):
        """Descriptive context-length errors should still be caught by
        the existing phrase matching (not the heuristic)."""
        error_msg = "prompt is too long: 250000 tokens > 200000 maximum"
        assert self._mentions_context_length(error_msg)
|
|
|
|
| |
| |
| |
|
|
class TestGatewaySkipsPersistenceOnFailure:
    """When the agent returns failed=True with no final_response,
    the gateway should NOT persist messages to the transcript.

    NOTE(review): these tests evaluate the failed-early predicate on plain
    dicts; they do not invoke the gateway itself.
    """

    @staticmethod
    def _failed_early(agent_result):
        """Return True when *agent_result* is marked failed and carries no
        (truthy) final response.

        Missing keys are treated as falsy, so a result without ``"failed"``
        is never considered failed-early.
        """
        return bool(agent_result.get("failed")) and not agent_result.get("final_response")

    def test_agent_failed_early_detected(self):
        """The agent_failed_early flag is True when failed=True and
        no final_response."""
        agent_result = {
            "failed": True,
            "final_response": None,
            "messages": [],
            "error": "Non-retryable client error",
        }
        assert self._failed_early(agent_result)

    def test_agent_with_response_not_failed_early(self):
        """When the agent has a final_response, it's not a failed-early
        scenario even if failed=True."""
        agent_result = {
            "failed": True,
            "final_response": "Here is a partial response",
            "messages": [],
        }
        assert not self._failed_early(agent_result)

    def test_successful_agent_not_failed_early(self):
        """A successful agent result should not trigger skip."""
        agent_result = {
            "final_response": "Hello!",
            "messages": [{"role": "assistant", "content": "Hello!"}],
        }
        assert not self._failed_early(agent_result)
|
|
|
|
| |
| |
| |
|
|
class TestContextOverflowErrorMessages:
    """The gateway should produce helpful error messages when the failure
    looks like a context overflow.

    NOTE(review): the detection logic is restated here as private helpers;
    the tests do not call the gateway. Keep the keyword list and history
    threshold aligned with the production check.
    """

    # Lower-case substrings that hint at a context/size related failure.
    _CONTEXT_KEYWORDS = (
        "context", "token", "too large", "too long",
        "exceed", "payload",
    )
    # A bare "400" is only treated as context overflow above this history length.
    _LARGE_HISTORY_LEN = 50

    @classmethod
    def _has_context_keyword(cls, error_str):
        """Return True if the lower-cased error contains any context keyword."""
        lowered = error_str.lower()
        return any(keyword in lowered for keyword in cls._CONTEXT_KEYWORDS)

    @classmethod
    def _is_context_failure(cls, error_str, history_len):
        """Return True for a keyword match, or for a "400" in the error text
        combined with a history longer than ``_LARGE_HISTORY_LEN``."""
        return cls._has_context_keyword(error_str) or (
            "400" in error_str and history_len > cls._LARGE_HISTORY_LEN
        )

    def test_detects_context_keywords(self):
        """Error messages containing context-related keywords should be
        identified as context failures."""
        error_strings = [
            "context length exceeded",
            "too many tokens in the prompt",
            "request entity too large",
            "payload too large for model",
            "context window exceeded",
        ]
        for error_str in error_strings:
            assert self._has_context_keyword(error_str), f"Should detect: {error_str}"

    def test_detects_generic_400_with_large_history(self):
        """A generic 400 error code in the string with a large history
        should be flagged as context failure."""
        error_str = "error code: 400 - {'type': 'error', 'message': 'Error'}"
        history_len = 100

        # No keyword matches, so the 400 + large-history branch is what fires.
        assert not self._has_context_keyword(error_str)
        assert self._is_context_failure(error_str, history_len)

    def test_unrelated_error_not_flagged(self):
        """Unrelated errors should not be flagged as context failures."""
        error_str = "invalid api key: authentication failed"
        history_len = 10

        assert not self._is_context_failure(error_str, history_len)
|
|
|
|
| |
| |
| |
|
|
class TestAgentSkipsPersistenceForLargeFailedSessions:
    """When a 400 error occurs and the session is large, the agent
    should skip persisting to prevent the growth loop.

    NOTE(review): the skip rule is restated locally in
    ``_should_skip_persistence``; the tests do not call the agent, so the
    thresholds below need to be kept in step with the real implementation.
    """

    # HTTP status the skip rule applies to.
    _BAD_REQUEST = 400
    # Session size limits above which a failed 400 session is not persisted.
    _TOKEN_THRESHOLD = 50000
    _MESSAGE_THRESHOLD = 80

    @classmethod
    def _should_skip_persistence(cls, status_code, approx_tokens, api_messages):
        """Return True for a 400 whose session exceeds either size limit."""
        session_is_large = (
            approx_tokens > cls._TOKEN_THRESHOLD
            or len(api_messages) > cls._MESSAGE_THRESHOLD
        )
        return status_code == cls._BAD_REQUEST and session_is_large

    def test_large_session_400_skips_persistence(self):
        """Status 400 + high token count should skip persistence."""
        api_messages = [{"role": "user", "content": "x"}] * 10

        assert self._should_skip_persistence(400, 60000, api_messages)

    def test_small_session_400_persists_normally(self):
        """Status 400 + small session should still persist."""
        api_messages = [{"role": "user", "content": "x"}] * 10

        assert not self._should_skip_persistence(400, 5000, api_messages)

    def test_non_400_error_persists_normally(self):
        """Non-400 errors should always persist normally."""
        api_messages = [{"role": "user", "content": "x"}] * 100

        assert not self._should_skip_persistence(401, 100000, api_messages)
|
|